//
// Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
// Copyright (c) 2020, 2025, Arm Limited. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
//
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
//
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.
//
//

dnl Generate the warning
// This file is automatically generated by running "m4 aarch64_vector_ad.m4". Do not edit!
dnl

// AArch64 VECTOR Architecture Description File


dnl
dnl OPERAND_VMEMORYA_IMMEDIATE_OFFSET($1,            $2      )
dnl OPERAND_VMEMORYA_IMMEDIATE_OFFSET(imm_type_abbr, imm_type)
dnl Generates an operand that matches a 4-bit signed immediate offset which is
dnl accepted by the SVE predicated load/store addressing mode. The offset is
dnl validated by Address::offset_ok_for_sve_immed() against the scalable
dnl vector register size. $1 selects the constant node (ConI/ConL) and $2 the
dnl accessor used to read its value (get_int/get_long).
define(`OPERAND_VMEMORYA_IMMEDIATE_OFFSET', `
operand vmemA_imm$1Offset4() %{
  // (esize / msize) = 1
  predicate(Address::offset_ok_for_sve_immed(n->get_$2(), 4,
            Matcher::scalable_vector_reg_size(T_BYTE)));
  match(Con$1);
  op_cost(0);
  format %{ %}
  interface(CONST_INTER);
%}')dnl
dnl
dnl OPERAND_VMEMORYA_INDIRECT_OFFSET($1           )
dnl OPERAND_VMEMORYA_INDIRECT_OFFSET(imm_type_abbr)
dnl Generates an indirect memory operand of the form [reg, imm4] (base pointer
dnl plus the immediate-offset operand generated above), used by the predicated
dnl SVE vector load/store rules. Note that `index' is quoted in the body to
dnl keep m4 from expanding its builtin macro of the same name.
define(`OPERAND_VMEMORYA_INDIRECT_OFFSET', `
operand vmemA_indOff$1`'4(iRegP reg, vmemA_imm$1Offset4 off) %{
  constraint(ALLOC_IN_RC(ptr_reg));
  match(AddP reg off);
  op_cost(0);
  format %{ "[$reg, $off]" %}
  interface(MEMORY_INTER) %{
    base($reg);
    `index'(0xffffffff);
    scale(0x0);
    disp($off);
  %}
%}')dnl
dnl
// 4 bit signed offset -- for predicated load/store
dnl Instantiate the immediate-offset and indirect operands for both int (I)
dnl and long (L) typed constant offsets.
OPERAND_VMEMORYA_IMMEDIATE_OFFSET(I, int)
OPERAND_VMEMORYA_IMMEDIATE_OFFSET(L, long)
OPERAND_VMEMORYA_INDIRECT_OFFSET(I)
OPERAND_VMEMORYA_INDIRECT_OFFSET(L)

// The indOff of vmemA is valid only when the vector element (load to/store from)
// size equals the memory element (load from/store to) size.
opclass vmemA(indirect, vmemA_indOffI4, vmemA_indOffL4);

source_hpp %{
  // Assert that the given node is not a variable shift.
  bool assert_not_var_shift(const Node* n);

  // Map a vector node to the NEON arrangement (element size plus 64/128-bit
  // width) or the SVE register variant matching its element type.
  Assembler::SIMD_Arrangement get_arrangement(const Node* n);
  Assembler::SIMD_RegVariant get_reg_variant(const Node* n);
%}

source %{

  // Member-function-pointer type for the predicated SVE ld1/st1 emitters.
  typedef void (C2_MacroAssembler::* sve_mem_insn_predicate)(FloatRegister Rt, Assembler::SIMD_RegVariant T,
                                                             PRegister Pg, const Address &adr);

  // Predicated load/store, with optional ptrue to all elements of given predicate register.
  // Selects sve_ld1/st1{b,h,w,d} based on the memory element size and emits it
  // with the byte displacement converted to the instruction's imm4 form.
  // Only the base-register addressing mode (index == -1, scale size == 0) is
  // implemented; any other mode trips the assert below.
  static void loadStoreA_predicated(C2_MacroAssembler* masm, bool is_store, FloatRegister reg,
                                    PRegister pg, BasicType mem_elem_bt, BasicType vector_elem_bt,
                                    int opcode, Register base, int index, int size, int disp) {
    sve_mem_insn_predicate insn;
    int mesize = type2aelembytes(mem_elem_bt);
    if (index == -1) {
      assert(size == 0, "unsupported address mode: scale size = %d", size);
      switch(mesize) {
      case 1:
        insn = is_store ? &C2_MacroAssembler::sve_st1b : &C2_MacroAssembler::sve_ld1b;
        break;
      case 2:
        insn = is_store ? &C2_MacroAssembler::sve_st1h : &C2_MacroAssembler::sve_ld1h;
        break;
      case 4:
        insn = is_store ? &C2_MacroAssembler::sve_st1w : &C2_MacroAssembler::sve_ld1w;
        break;
      case 8:
        insn = is_store ? &C2_MacroAssembler::sve_st1d : &C2_MacroAssembler::sve_ld1d;
        break;
      default:
        assert(false, "unsupported");
        ShouldNotReachHere();
      }
      // Convert the byte displacement into the imm4 offset expected by the
      // SVE ld1/st1 forms: disp scaled down by the memory element size and
      // the scalable register's element count for vector_elem_bt.
      int imm4 = disp / mesize / Matcher::scalable_vector_reg_size(vector_elem_bt);
      (masm->*insn)(reg, Assembler::elemType_to_regVariant(vector_elem_bt), pg, Address(base, imm4));
    } else {
      assert(false, "unimplemented");
      ShouldNotReachHere();
    }
  }

  // Filter out rules that exist only for the Vector API and would be
  // unprofitable (or non-conforming) when produced by auto-vectorization,
  // then defer to the common vector match-rule check.
  bool Matcher::match_rule_supported_auto_vectorization(int opcode, int vlen, BasicType bt) {
    if (UseSVE == 0) {
      // These operations are not profitable to be vectorized on NEON, because no direct
      // NEON instructions support them. They use multiple instructions which is more
      // expensive in almost all cases where we would auto vectorize.
      // But the match rule support for them is profitable for Vector API intrinsics.
      switch (opcode) {
        case Op_VectorCastD2X:
          if (bt == T_INT || bt == T_SHORT) {
            return false;
          }
          break;
        case Op_VectorCastL2X:
          if (bt == T_FLOAT) {
            return false;
          }
          break;
        case Op_CountLeadingZerosV:
        case Op_CountTrailingZerosV:
          if (bt == T_LONG) {
            return false;
          }
          break;
        case Op_MulVL:
          return false;
        // The implementations of Op_AddReductionVD/F in Neon are for the Vector API only.
        // They are not suitable for auto-vectorization because the result would not conform
        // to the JLS, Section Evaluation Order.
        // Note: we could implement sequential reductions for these reduction operators, but
        //       this will still almost never lead to speedups, because the sequential
        //       reductions are latency limited along the reduction chain, and not
        //       throughput limited. This is unlike unordered reductions (associative op)
        //       and element-wise ops which are usually throughput limited.
        case Op_AddReductionVD:
        case Op_AddReductionVF:
        case Op_MulReductionVD:
        case Op_MulReductionVF:
          return false;
        default:
          break;
      }
    }
    return match_rule_supported_vector(opcode, vlen, bt);
  }

  // Identify extra cases that we might want to provide match rules for vector nodes and
  // other intrinsics guarded with vector length (vlen) and element type (bt).
  bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
    if (!match_rule_supported(opcode)) {
      return false;
    }

    // Total vector size in bytes for this (vlen, bt) combination.
    int length_in_bytes = vlen * type2aelembytes(bt);
    if (UseSVE == 0 && length_in_bytes > FloatRegister::neon_vl) {
      // Without SVE, vectors wider than a NEON register are unsupported.
      return false;
    }

    // Check whether specific Op is supported.
    // Fail fast, otherwise fall through to common vector_size_supported() check.
    switch (opcode) {
      // The following mask, gather/scatter and compress operations are
      // implemented only with SVE.
      case Op_AndVMask:
      case Op_OrVMask:
      case Op_XorVMask:
      case Op_MaskAll:
      case Op_VectorMaskGen:
      case Op_LoadVectorMasked:
      case Op_StoreVectorMasked:
      case Op_StoreVectorScatter:
      case Op_StoreVectorScatterMasked:
      case Op_PopulateIndex:
      case Op_CompressM:
      case Op_CompressV:
        if (UseSVE == 0) {
          return false;
        }
        break;
      // Gather loads additionally exclude subword element types.
      case Op_LoadVectorGather:
      case Op_LoadVectorGatherMasked:
        if (UseSVE == 0 || is_subword_type(bt)) {
          return false;
        }
        break;
      // Only implemented for 128-bit vectors.
      case Op_MulAddVS2VI:
        if (length_in_bytes != 16) {
          return false;
        }
        break;
      case Op_AddReductionVI:
      case Op_AndReductionV:
      case Op_OrReductionV:
      case Op_XorReductionV:
      case Op_MinReductionV:
      case Op_MaxReductionV:
        // Reductions with less than 8 bytes vector length are
        // not supported.
        if (length_in_bytes < 8) {
          return false;
        }
        break;
      case Op_MulReductionVD:
      case Op_MulReductionVF:
      case Op_MulReductionVI:
      case Op_MulReductionVL:
        // No vector multiply reduction instructions, but we do
        // emit scalar instructions for 64/128-bit vectors.
        if (length_in_bytes != 8 && length_in_bytes != 16) {
          return false;
        }
        break;
      // Mask compares need at least an 8-byte vector.
      case Op_VectorMaskCmp:
        if (length_in_bytes < 8) {
          return false;
        }
        break;
      // On SVE, mask-to-long handles at most 64 elements.
      case Op_VectorMaskToLong:
        if (UseSVE > 0 && vlen > 64) {
          return false;
        }
        break;
      // Long-to-mask needs the SVE bitperm extension and at most 64 elements.
      case Op_VectorLongToMask:
        if (vlen > 64 || !VM_Version::supports_svebitperm()) {
          return false;
        }
        break;
      // Bit compress/expand require the SVE bitperm extension.
      case Op_CompressBitsV:
      case Op_ExpandBitsV:
        if (!VM_Version::supports_svebitperm()) {
          return false;
        }
        break;
      case Op_AddVHF:
      case Op_SubVHF:
      case Op_MulVHF:
      case Op_DivVHF:
      case Op_MinVHF:
      case Op_MaxVHF:
      case Op_SqrtVHF:
        // FEAT_FP16 is enabled if both "fphp" and "asimdhp" features are supported.
        // Only the Neon instructions need this check. SVE supports half-precision floats
        // by default.
        if (UseSVE == 0 && !is_feat_fp16_supported()) {
          return false;
        }
        break;
      case Op_FmaVHF:
        // UseFMA flag needs to be checked along with FEAT_FP16
        if (!UseFMA || (UseSVE == 0 && !is_feat_fp16_supported())) {
          return false;
        }
        break;
      case Op_SelectFromTwoVector:
        // The "tbl" instruction for two vector table is supported only in Neon and SVE2. Return
        // false if vector length > 16B but supported SVE version < 2.
        // For vector length of 16B, generate SVE2 "tbl" instruction if SVE2 is supported, else
        // generate Neon "tbl" instruction to select from two vectors.
        // This operation is disabled for doubles and longs on machines with SVE < 2 and instead
        // the default VectorRearrange + VectorBlend is generated because the performance of the default
        // implementation was better than or equal to the implementation for SelectFromTwoVector.
        if (UseSVE < 2 && (type2aelembytes(bt) == 8 || length_in_bytes > 16)) {
          return false;
        }

        // Because the SVE2 "tbl" instruction is unpredicated and partial operations cannot be generated
        // using masks, we disable this operation on machines where length_in_bytes < MaxVectorSize
        // on that machine with the only exception of 8B vector length. This is because at the time of
        // writing this, there is no SVE2 machine available with length_in_bytes > 8 and
        // length_in_bytes < MaxVectorSize to test this operation on (for example - there isn't an
        // SVE2 machine available with MaxVectorSize = 32 to test a case with length_in_bytes = 16).
        if (UseSVE == 2 && length_in_bytes > 8 && length_in_bytes < MaxVectorSize) {
          return false;
        }
        break;
      default:
        break;
    }
    return vector_size_supported(bt, vlen);
  }

  // Check whether the masked (predicated) variant of the given opcode is
  // supported for this vector length and element type. Some opcodes are
  // remapped to their masked forms before the common check.
  bool Matcher::match_rule_supported_vector_masked(int opcode, int vlen, BasicType bt) {
    // Only SVE supports masked operations.
    if (UseSVE == 0) {
      return false;
    }

    // If an opcode does not support the masked version,
    // unpredicated node with VectorBlend node will be used instead.
    switch(opcode) {
      case Op_VectorRearrange:
      case Op_MulReductionVD:
      case Op_MulReductionVF:
      case Op_MulReductionVI:
      case Op_MulReductionVL:
      case Op_CompressBitsV:
      case Op_ExpandBitsV:
        return false;
      case Op_SaturatingAddV:
      case Op_SaturatingSubV:
        // Only SVE2 supports the predicated saturating instructions.
        if (UseSVE < 2) {
          return false;
        }
        break;
      // We use Op_LoadVectorMasked to implement the predicated Op_LoadVector.
      // Hence we turn to check whether Op_LoadVectorMasked is supported. The
      // same as vector store/gather/scatter.
      case Op_LoadVector:
        opcode = Op_LoadVectorMasked;
        break;
      case Op_StoreVector:
        opcode = Op_StoreVectorMasked;
        break;
      case Op_LoadVectorGather:
        opcode = Op_LoadVectorGatherMasked;
        break;
      case Op_StoreVectorScatter:
        opcode = Op_StoreVectorScatterMasked;
        break;
      // Currently, the masked versions of the following 8 Float16 operations are disabled.
      // When the support for Float16 vector classes is added in VectorAPI and the masked
      // Float16 IR can be generated, these masked operations will be enabled and relevant
      // backend support added.
      case Op_AddVHF:
      case Op_SubVHF:
      case Op_MulVHF:
      case Op_DivVHF:
      case Op_MaxVHF:
      case Op_MinVHF:
      case Op_SqrtVHF:
      case Op_FmaVHF:
        return false;
      default:
        break;
    }

    return match_rule_supported_vector(opcode, vlen, bt);
  }

  // Decide whether the given vector node must be implemented with partial
  // (predicated) SVE operations for the given vector type, i.e. cases where
  // a full-width unpredicated instruction could produce a wrong result.
  bool Matcher::vector_needs_partial_operations(Node* node, const TypeVect* vt) {
    // Only SVE has partial vector operations
    if (UseSVE == 0) {
      return false;
    }

    switch(node->Opcode()) {
      case Op_VectorLoadMask:
      case Op_VectorMaskCmp:
      case Op_LoadVectorGather:
      case Op_StoreVectorScatter:
      case Op_AddReductionVF:
      case Op_AddReductionVD:
      case Op_AndReductionV:
      case Op_OrReductionV:
      case Op_XorReductionV:
      // Mask is needed for partial Op_VectorMaskFirstTrue, because when the
      // input predicate is all-false, the result should be the vector length
      // instead of the vector register size.
      case Op_VectorMaskFirstTrue:
        return true;
      case Op_MaskAll:
        // A constant input can be materialized exactly; only the non-constant
        // case needs a partial operation.
        return !node->in(1)->is_Con();
      case Op_LoadVector:
      case Op_StoreVector:
        // We use NEON load/store instructions if the vector length is <= 128 bits.
        return vt->length_in_bytes() > 16;
      case Op_AddReductionVI:
      case Op_AddReductionVL:
        // We may prefer using NEON instructions rather than SVE partial operations.
        return !VM_Version::use_neon_for_vector(vt->length_in_bytes());
      case Op_MinReductionV:
      case Op_MaxReductionV:
        // For BYTE/SHORT/INT/FLOAT/DOUBLE types, we may prefer using NEON
        // instructions rather than SVE partial operations.
        return vt->element_basic_type() == T_LONG ||
               !VM_Version::use_neon_for_vector(vt->length_in_bytes());
      default:
        // For other ops whose vector size is smaller than the max vector size, a
        // full-sized unpredicated operation does not impact the final vector result.
        return false;
    }
  }

  // AArch64 never requests a separate load-shuffle step for VectorRearrange,
  // regardless of element type or vector length.
  bool Matcher::vector_rearrange_requires_load_shuffle(BasicType elem_bt, int vlen) {
    return false;
  }

  // Decide whether a mask operation should keep its mask in an SVE predicate
  // register (true) or in a general vector register (false).
  bool Matcher::mask_op_prefers_predicate(int opcode, const TypeVect* vt) {
    // Only SVE supports the predicate feature.
    if (UseSVE == 0) {
      // On architectures that do not support predicate, masks are stored in
      // general vector registers (TypeVect) with sizes ranging from TypeVectA
      // to TypeVectX based on the vector size in bytes.
      assert(vt->isa_vectmask() == nullptr, "mask type is not matched");
      return false;
    }

    assert(vt->isa_vectmask() != nullptr, "expected TypeVectMask on SVE");
    switch (opcode) {
      case Op_VectorMaskToLong:
      case Op_VectorLongToMask:
        // These operations lack native SVE predicate instructions and are
        // implemented using general vector instructions instead. Use vector
        // registers rather than predicate registers to save the mask for
        // better performance.
        return false;
      default:
        // By default, the mask operations are implemented with predicate
        // instructions with a predicate input/output.
        return true;
    }
  }

  // Assert that the given node is not a variable shift.
  // Always returns true so it can be used directly inside assert() conditions.
  bool assert_not_var_shift(const Node* n) {
    assert(!n->as_ShiftV()->is_var_shift(), "illegal variable shift");
    return true;
  }

  // Derive the NEON arrangement from a vector node's element size; a
  // 16-byte vector selects the quad-word (Q) form of the arrangement.
  Assembler::SIMD_Arrangement get_arrangement(const Node* n) {
    bool is_quad = (Matcher::vector_length_in_bytes(n) == 16);
    uint esize = (uint)type2aelembytes(Matcher::vector_element_basic_type(n));
    return Assembler::esize2arrangement(esize, /* isQ */ is_quad);
  }

  // The SVE register variant depends only on the node's element basic type.
  Assembler::SIMD_RegVariant get_reg_variant(const Node* n) {
    return Assembler::elemType_to_regVariant(Matcher::vector_element_basic_type(n));
  }
%}


// All VECTOR instructions

// ------------------------------ Vector load/store ----------------------------
dnl
dnl VECTOR_LOAD_STORE($1,   $2,     $3,       $4,    $5  )
dnl VECTOR_LOAD_STORE(type, nbytes, arg_name, nbits, size)
dnl Generates a NEON-sized (<= 128 bit) vector load or store rule whose
dnl predicate keys on the in-memory vector size in bytes ($2); the encoding
dnl dispatches to the ldr/str helper for the $5-sized access.
define(`VECTOR_LOAD_STORE', `
// ifelse(load, $1, Load, Store) Vector ($4 bits)
instruct $1V$2(vReg $3, vmem$2 mem) %{
  predicate(`n->as_'ifelse(load, $1, Load, Store)Vector()->memory_size() == $2);
  match(Set ifelse(load, $1, dst (LoadVector mem), mem (StoreVector mem src)));
  format %{ "$1V$2 ifelse(load, $1, `$dst, $mem', `$mem, $src')\t# vector ($4 bits)" %}
  ins_encode( `aarch64_enc_'ifelse(load, $1, ldr, str)v$5($3, mem) );
  ins_pipe(pipe_slow);
%}')dnl
dnl
dnl Load/store rules for 16-, 32-, 64- and 128-bit vectors.
VECTOR_LOAD_STORE(load,  2,  dst, 16,  H)
VECTOR_LOAD_STORE(store, 2,  src, 16,  H)
VECTOR_LOAD_STORE(load,  4,  dst, 32,  S)
VECTOR_LOAD_STORE(store, 4,  src, 32,  S)
VECTOR_LOAD_STORE(load,  8,  dst, 64,  D)
VECTOR_LOAD_STORE(store, 8,  src, 64,  D)
VECTOR_LOAD_STORE(load,  16, dst, 128, Q)
VECTOR_LOAD_STORE(store, 16, src, 128, Q)

// Load Vector (> 128 bits)
instruct loadV(vReg dst, vmemA mem) %{
  predicate(n->as_LoadVector()->memory_size() > 16);
  match(Set dst (LoadVector mem));
  format %{ "loadV $dst, $mem\t# vector (sve)" %}
  ins_encode %{
    // SVE-only path: the vector is wider than a NEON register, so emit a
    // predicated ld1 governed by an all-true predicate (ptrue).
    assert(UseSVE > 0, "must be sve");
    BasicType bt = Matcher::vector_element_basic_type(this);
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    assert(length_in_bytes == MaxVectorSize, "invalid vector length");
    loadStoreA_predicated(masm, /* is_store */ false,
                          $dst$$FloatRegister, ptrue, bt, bt, $mem->opcode(),
                          as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp);
  %}
  ins_pipe(pipe_slow);
%}

// Store Vector (> 128 bits)
instruct storeV(vReg src, vmemA mem) %{
  predicate(n->as_StoreVector()->memory_size() > 16);
  match(Set mem (StoreVector mem src));
  format %{ "storeV $mem, $src\t# vector (sve)" %}
  ins_encode %{
    // SVE-only path: the vector is wider than a NEON register, so emit a
    // predicated st1 governed by an all-true predicate (ptrue).
    assert(UseSVE > 0, "must be sve");
    BasicType bt = Matcher::vector_element_basic_type(this, $src);
    uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src);
    assert(length_in_bytes == MaxVectorSize, "invalid vector length");
    loadStoreA_predicated(masm, /* is_store */ true,
                          $src$$FloatRegister, ptrue, bt, bt, $mem->opcode(),
                          as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp);
  %}
  ins_pipe(pipe_slow);
%}

// vector load/store - predicated

// Masked vector load: ld1 governed by the mask in predicate register $pg.
instruct loadV_masked(vReg dst, vmemA mem, pRegGov pg) %{
  predicate(UseSVE > 0);
  match(Set dst (LoadVectorMasked mem pg));
  format %{ "loadV_masked $dst, $pg, $mem" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    loadStoreA_predicated(masm, /* is_store */ false, $dst$$FloatRegister,
                          $pg$$PRegister, bt, bt, $mem->opcode(),
                          as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp);
  %}
  ins_pipe(pipe_slow);
%}

// Masked vector store: st1 governed by the mask in predicate register $pg.
instruct storeV_masked(vReg src, vmemA mem, pRegGov pg) %{
  predicate(UseSVE > 0);
  match(Set mem (StoreVectorMasked mem (Binary src pg)));
  format %{ "storeV_masked $mem, $pg, $src" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this, $src);
    loadStoreA_predicated(masm, /* is_store */ true, $src$$FloatRegister,
                          $pg$$PRegister, bt, bt, $mem->opcode(),
                          as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp);
  %}
  ins_pipe(pipe_slow);
%}

// vector load const

// Materialize a vector of iota indices (0, 1, 2, ...) converted to the
// node's element type.
instruct vloadcon(vReg dst, immI0 src) %{
  match(Set dst (VectorLoadConst src));
  format %{ "vloadcon $dst, $src\t# load/generate iota indices" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    if (UseSVE == 0) {
      // NEON: load the pre-generated indices from the stub table.
      uint length_in_bytes = Matcher::vector_length_in_bytes(this);
      assert(length_in_bytes <= 16, "must be");
      // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 16.
      int offset = exact_log2(type2aelembytes(bt)) << 4;
      if (is_floating_point_type(bt)) {
        // Floating-point tables follow the four integral ones (+32 bytes).
        offset += 32;
      }
      __ lea(rscratch1, ExternalAddress(StubRoutines::aarch64::vector_iota_indices() + offset));
      if (length_in_bytes == 16) {
        __ ldrq($dst$$FloatRegister, rscratch1);
      } else {
        __ ldrd($dst$$FloatRegister, rscratch1);
      }
    } else {
      // SVE: generate the indices directly (start 0, step 1), then convert
      // to floating point if the element type requires it.
      Assembler::SIMD_RegVariant size = __ elemType_to_regVariant(bt);
      __ sve_index($dst$$FloatRegister, size, 0, 1);
      if (is_floating_point_type(bt)) {
        __ sve_scvtf($dst$$FloatRegister, size, ptrue, $dst$$FloatRegister, size);
      }
    }
  %}
  ins_pipe(pipe_slow);
%}

dnl
dnl BINARY_OP($1,        $2,      $3,        $4,       $5  )
dnl BINARY_OP(rule_name, op_name, insn_neon, insn_sve, size)
dnl Generates an unpredicated element-wise binary rule that selects the NEON
dnl instruction when the vector fits in a NEON register and the unpredicated
dnl SVE instruction otherwise.
define(`BINARY_OP', `
instruct $1(vReg dst, vReg src1, vReg src2) %{
  match(Set dst ($2 src1 src2));
  format %{ "$1 $dst, $src1, $src2" %}
  ins_encode %{
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    if (VM_Version::use_neon_for_vector(length_in_bytes)) {
      __ $3($dst$$FloatRegister, get_arrangement(this),
              $src1$$FloatRegister, $src2$$FloatRegister);
    } else {
      assert(UseSVE > 0, "must be sve");
      __ $4($dst$$FloatRegister, __ $5, $src1$$FloatRegister, $src2$$FloatRegister);
    }
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl
dnl BINARY_OP_PREDICATE($1,        $2,      $3,   $4  )
dnl BINARY_OP_PREDICATE(rule_name, op_name, insn, size)
dnl Generates the SVE masked (predicated) form; dst_src1 is both the first
dnl input and the destination (destructive two-address form).
define(`BINARY_OP_PREDICATE', `
instruct $1_masked(vReg dst_src1, vReg src2, pRegGov pg) %{
  predicate(UseSVE > 0);
  match(Set dst_src1 ($2 (Binary dst_src1 src2) pg));
  format %{ "$1_masked $dst_src1, $pg, $dst_src1, $src2" %}
  ins_encode %{
    __ $3($dst_src1$$FloatRegister, __ $4, $pg$$PRegister, $src2$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl
dnl VADD_IMM($1,   $2,       $3  )
dnl VADD_IMM(type, imm_type, size)
dnl Generates an add-immediate rule using sve_add for positive constants and
dnl sve_sub with the negated value otherwise.
define(`VADD_IMM', `
instruct vaddImm$1(vReg dst_src, $2 con) %{
  predicate(UseSVE > 0);
  match(Set dst_src (AddV$1 dst_src (Replicate con)));
  format %{ "vaddImm$1 $dst_src, $dst_src, $con" %}
  ins_encode %{
    int val = (int)$con$$constant;
    if (val > 0) {
      __ sve_add($dst_src$$FloatRegister, __ $3, val);
    } else {
      __ sve_sub($dst_src$$FloatRegister, __ $3, -val);
    }
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl
// ------------------------------ Vector add -----------------------------------

// vector add
dnl Integer adds use addv/sve_add; floating-point adds use fadd/sve_fadd.
BINARY_OP(vaddB,  AddVB,  addv, sve_add,  B)
BINARY_OP(vaddS,  AddVS,  addv, sve_add,  H)
BINARY_OP(vaddI,  AddVI,  addv, sve_add,  S)
BINARY_OP(vaddL,  AddVL,  addv, sve_add,  D)
BINARY_OP(vaddHF, AddVHF, fadd, sve_fadd, H)
BINARY_OP(vaddF,  AddVF,  fadd, sve_fadd, S)
BINARY_OP(vaddD,  AddVD,  fadd, sve_fadd, D)

// vector add - predicated
BINARY_OP_PREDICATE(vaddB, AddVB, sve_add,  B)
BINARY_OP_PREDICATE(vaddS, AddVS, sve_add,  H)
BINARY_OP_PREDICATE(vaddI, AddVI, sve_add,  S)
BINARY_OP_PREDICATE(vaddL, AddVL, sve_add,  D)
BINARY_OP_PREDICATE(vaddF, AddVF, sve_fadd, S)
BINARY_OP_PREDICATE(vaddD, AddVD, sve_fadd, D)

// vector add reg imm (unpredicated)
VADD_IMM(B, immBAddSubV, B)
VADD_IMM(S, immIAddSubV, H)
VADD_IMM(I, immIAddSubV, S)
VADD_IMM(L, immLAddSubV, D)

// ------------------------------ Vector sub -----------------------------------

// vector sub
dnl Integer subs use subv/sve_sub; floating-point subs use fsub/sve_fsub.
BINARY_OP(vsubB,  SubVB,  subv, sve_sub,  B)
BINARY_OP(vsubS,  SubVS,  subv, sve_sub,  H)
BINARY_OP(vsubI,  SubVI,  subv, sve_sub,  S)
BINARY_OP(vsubL,  SubVL,  subv, sve_sub,  D)
BINARY_OP(vsubHF, SubVHF, fsub, sve_fsub, H)
BINARY_OP(vsubF,  SubVF,  fsub, sve_fsub, S)
BINARY_OP(vsubD,  SubVD,  fsub, sve_fsub, D)

// vector sub - predicated
BINARY_OP_PREDICATE(vsubB, SubVB, sve_sub,  B)
BINARY_OP_PREDICATE(vsubS, SubVS, sve_sub,  H)
BINARY_OP_PREDICATE(vsubI, SubVI, sve_sub,  S)
BINARY_OP_PREDICATE(vsubL, SubVL, sve_sub,  D)
BINARY_OP_PREDICATE(vsubF, SubVF, sve_fsub, S)
BINARY_OP_PREDICATE(vsubD, SubVD, sve_fsub, D)

dnl
dnl BINARY_OP_NEON_SVE_PAIRWISE($1,        $2,      $3,        $4,       $5  )
dnl BINARY_OP_NEON_SVE_PAIRWISE(rule_name, op_name, insn_neon, insn_sve, size)
dnl Generates a pair of rules: a three-address NEON form, and a destructive
dnl two-address SVE form (dst_src1 is both input and output) governed by an
dnl all-true predicate. The predicates on vector length make the two rules
dnl mutually exclusive.
define(`BINARY_OP_NEON_SVE_PAIRWISE', `
instruct $1_neon(vReg dst, vReg src1, vReg src2) %{
  predicate(VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n)));
  match(Set dst ($2 src1 src2));
  format %{ "$1_neon $dst, $src1, $src2" %}
  ins_encode %{
    __ $3($dst$$FloatRegister, get_arrangement(this),
            $src1$$FloatRegister, $src2$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

instruct $1_sve(vReg dst_src1, vReg src2) %{
  predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n)));
  match(Set dst_src1 ($2 dst_src1 src2));
  format %{ "$1_sve $dst_src1, $dst_src1, $src2" %}
  ins_encode %{
    assert(UseSVE > 0, "must be sve");
    __ $4($dst_src1$$FloatRegister, __ $5, ptrue, $src2$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl
// ------------------------------ Vector mul -----------------------------------

// vector mul - BYTE, CHAR, SHORT, INT
BINARY_OP_NEON_SVE_PAIRWISE(vmulB, MulVB, mulv, sve_mul, B)
BINARY_OP_NEON_SVE_PAIRWISE(vmulS, MulVS, mulv, sve_mul, H)
BINARY_OP_NEON_SVE_PAIRWISE(vmulI, MulVI, mulv, sve_mul, S)

// vector mul - LONG

// Without SVE there is no direct 64x64-bit vector multiply (see the NEON
// comment in match_rule_supported_auto_vectorization), so multiply each of
// the two lanes through the scalar registers.
instruct vmulL_neon(vReg dst, vReg src1, vReg src2) %{
  predicate(UseSVE == 0);
  match(Set dst (MulVL src1 src2));
  format %{ "vmulL_neon $dst, $src1, $src2\t# 2L" %}
  ins_encode %{
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    assert(length_in_bytes == 16, "must be");
    // Lane 0: extract both operands, multiply, insert into dst.
    __ umov(rscratch1, $src1$$FloatRegister, __ D, 0);
    __ umov(rscratch2, $src2$$FloatRegister, __ D, 0);
    __ mul(rscratch2, rscratch2, rscratch1);
    __ mov($dst$$FloatRegister, __ D, 0, rscratch2);
    // Lane 1: same sequence for the upper 64-bit lane.
    __ umov(rscratch1, $src1$$FloatRegister, __ D, 1);
    __ umov(rscratch2, $src2$$FloatRegister, __ D, 1);
    __ mul(rscratch2, rscratch2, rscratch1);
    __ mov($dst$$FloatRegister, __ D, 1, rscratch2);
  %}
  ins_pipe(pipe_slow);
%}

// SVE 64-bit multiply: destructive sve_mul on D-sized elements, governed by
// an all-true predicate.
instruct vmulL_sve(vReg dst_src1, vReg src2) %{
  predicate(UseSVE > 0);
  match(Set dst_src1 (MulVL dst_src1 src2));
  format %{ "vmulL_sve $dst_src1, $dst_src1, $src2" %}
  ins_encode %{
    __ sve_mul($dst_src1$$FloatRegister, __ D, ptrue, $src2$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

// vector mul - floating-point
dnl (See BINARY_OP / BINARY_OP_PREDICATE / BINARY_OP_NEON_SVE_PAIRWISE above.)
BINARY_OP(vmulHF, MulVHF, fmul, sve_fmul, H)
BINARY_OP(vmulF,  MulVF,  fmul, sve_fmul, S)
BINARY_OP(vmulD,  MulVD,  fmul, sve_fmul, D)

// vector mul - predicated
BINARY_OP_PREDICATE(vmulB, MulVB, sve_mul,  B)
BINARY_OP_PREDICATE(vmulS, MulVS, sve_mul,  H)
BINARY_OP_PREDICATE(vmulI, MulVI, sve_mul,  S)
BINARY_OP_PREDICATE(vmulL, MulVL, sve_mul,  D)
BINARY_OP_PREDICATE(vmulF, MulVF, sve_fmul, S)
BINARY_OP_PREDICATE(vmulD, MulVD, sve_fmul, D)

// ------------------------------ Vector float div -----------------------------

// vector float div
BINARY_OP_NEON_SVE_PAIRWISE(vdivHF, DivVHF, fdiv, sve_fdiv, H)
BINARY_OP_NEON_SVE_PAIRWISE(vdivF,  DivVF,  fdiv, sve_fdiv, S)
BINARY_OP_NEON_SVE_PAIRWISE(vdivD,  DivVD,  fdiv, sve_fdiv, D)

// vector float div - predicated
BINARY_OP_PREDICATE(vdivF, DivVF, sve_fdiv, S)
BINARY_OP_PREDICATE(vdivD, DivVD, sve_fdiv, D)
dnl
dnl BITWISE_OP_IMM($1,        $2,   $3,      $4,   $5,   $6        )
dnl BITWISE_OP_IMM(rule_name, type, op_name, insn, size, basic_type)
dnl Generates an SVE bitwise rule taking an immLog-constrained constant
dnl replicated across all lanes of element type $6.
define(`BITWISE_OP_IMM', `
instruct $1(vReg dst_src, imm$2Log con) %{
  predicate(UseSVE > 0 && Matcher::vector_element_basic_type(n) == $6);
  match(Set dst_src ($3 dst_src (Replicate con)));
  format %{ "$1 $dst_src, $dst_src, $con" %}
  ins_encode %{
    __ $4($dst_src$$FloatRegister, __ $5, (uint64_t)($con$$constant));
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl
dnl BITWISE_OPS($1,        $2,      $3,        $4      )
dnl BITWISE_OPS(rule_name, op_name, insn_neon, insn_sve)
dnl Expands to the full rule set for one bitwise operation: the unpredicated
dnl vector form (NEON or SVE by length), the SVE masked form, and the
dnl reg-imm forms for each element type via BITWISE_OP_IMM.
define(`BITWISE_OPS',
`// vector $1

instruct v$1(vReg dst, vReg src1, vReg src2) %{
  match(Set dst ($2 src1 src2));
  format %{ "v$1 $dst, $src1, $src2" %}
  ins_encode %{
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    if (VM_Version::use_neon_for_vector(length_in_bytes)) {
      __ $3($dst$$FloatRegister, length_in_bytes == 16 ? __ T16B : __ T8B,
              $src1$$FloatRegister, $src2$$FloatRegister);
    } else {
      assert(UseSVE > 0, "must be sve");
      __ $4($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister);
    }
  %}
  ins_pipe(pipe_slow);
%}

// vector $1 - predicated

instruct v$1_masked(vReg dst_src1, vReg src2, pRegGov pg) %{
  predicate(UseSVE > 0);
  match(Set dst_src1 ($2 (Binary dst_src1 src2) pg));
  format %{ "v$1_masked $dst_src1, $pg, $dst_src1, $src2" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ $4($dst_src1$$FloatRegister, __ elemType_to_regVariant(bt),
               $pg$$PRegister, $src2$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

// vector $1 reg imm (unpredicated)
BITWISE_OP_IMM(v$1ImmB, B, $2, $4, B, T_BYTE)
BITWISE_OP_IMM(v$1ImmS, S, $2, $4, H, T_SHORT)
BITWISE_OP_IMM(v$1ImmI, I, $2, $4, S, T_INT)
BITWISE_OP_IMM(v$1ImmL, L, $2, $4, D, T_LONG)')dnl

// ------------------------------ Vector and -----------------------------------

dnl Instantiate the full bitwise rule sets (vector, masked, reg-imm forms).
BITWISE_OPS(and, AndV, andr, sve_and)

// ------------------------------ Vector or ------------------------------------

BITWISE_OPS(or, OrV, orr, sve_orr)

// ------------------------------ Vector xor -----------------------------------

BITWISE_OPS(xor, XorV, eor, sve_eor)

// vector eor3 (unpredicated)

// Three-way XOR in a single instruction using the SHA3 EOR3 extension.
instruct veor3_neon(vReg dst, vReg src1, vReg src2, vReg src3) %{
  predicate(VM_Version::supports_sha3() &&
            VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n)));
  match(Set dst (XorV src1 (XorV src2 src3)));
  format %{ "veor3_neon $dst, $src1, $src2, $src3" %}
  ins_encode %{
    __ eor3($dst$$FloatRegister, __ T16B, $src1$$FloatRegister,
            $src2$$FloatRegister, $src3$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

// SVE2 EOR3: destructive form where dst_src1 is both the destination and
// the first source.
instruct veor3_sve(vReg dst_src1, vReg src2, vReg src3) %{
  predicate(UseSVE == 2 && !VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n)));
  match(Set dst_src1 (XorV dst_src1 (XorV src2 src3)));
  format %{ "veor3_sve $dst_src1, $dst_src1, $src2, $src3" %}
  ins_encode %{
    __ sve_eor3($dst_src1$$FloatRegister, $src2$$FloatRegister, $src3$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

// ------------------------------ Vector not -----------------------------------

dnl
dnl VECTOR_NOT($1  )
dnl VECTOR_NOT(type)
dnl Bitwise NOT, matched as xor with an all-ones replicate (m1 == -1).
define(`VECTOR_NOT', `
instruct vnot$1`'(vReg dst, vReg src, imm$1_M1 m1) %{
  match(Set dst (XorV src (Replicate m1)));
  format %{ "vnot$1 $dst, $src" %}
  ins_encode %{
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    if (VM_Version::use_neon_for_vector(length_in_bytes)) {
      __ notr($dst$$FloatRegister, length_in_bytes == 16 ? __ T16B : __ T8B,
              $src$$FloatRegister);
    } else {
      assert(UseSVE > 0, "must be sve");
      __ sve_not($dst$$FloatRegister, __ D, ptrue, $src$$FloatRegister);
    }
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl
// vector not
VECTOR_NOT(I)
VECTOR_NOT(L)
dnl
dnl VECTOR_NOT_PREDICATE($1  )
dnl VECTOR_NOT_PREDICATE(type)
dnl SVE masked bitwise NOT; the register variant is taken from the node's
dnl element type via get_reg_variant().
define(`VECTOR_NOT_PREDICATE', `
instruct vnot$1_masked`'(vReg dst_src, imm$1_M1 m1, pRegGov pg) %{
  predicate(UseSVE > 0);
  match(Set dst_src (XorV (Binary dst_src (Replicate m1)) pg));
  format %{ "vnot$1_masked $dst_src, $pg, $dst_src" %}
  ins_encode %{
    __ sve_not($dst_src$$FloatRegister, get_reg_variant(this),
               $pg$$PRegister, $dst_src$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl

// vector not - predicated
VECTOR_NOT_PREDICATE(I)
VECTOR_NOT_PREDICATE(L)

dnl
// ------------------------------ Vector and_not -------------------------------
dnl
dnl VECTOR_AND_NOT($1  )
dnl VECTOR_AND_NOT(type)
dnl Emits vand_notI/vand_notL: dst = src1 AND (NOT src2), matched from
dnl AndV of src1 with src2 XORed against replicated -1. Maps to the BIC
dnl instruction on both NEON and SVE; unpredicated and purely bitwise,
dnl so no element size is needed on the SVE form.
define(`VECTOR_AND_NOT', `
instruct vand_not$1`'(vReg dst, vReg src1, vReg src2, imm$1_M1 m1) %{
  match(Set dst (AndV src1 (XorV src2 (Replicate m1))));
  format %{ "vand_not$1 $dst, $src1, $src2" %}
  ins_encode %{
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    if (VM_Version::use_neon_for_vector(length_in_bytes)) {
      __ bic($dst$$FloatRegister, length_in_bytes == 16 ? __ T16B : __ T8B,
             $src1$$FloatRegister, $src2$$FloatRegister);
    } else {
      assert(UseSVE > 0, "must be sve");
      __ sve_bic($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister);
    }
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl

// vector and_not
VECTOR_AND_NOT(I)
VECTOR_AND_NOT(L)
dnl
dnl VECTOR_AND_NOT_PREDICATE($1  )
dnl VECTOR_AND_NOT_PREDICATE(type)
dnl Emits vand_notI_masked/vand_notL_masked: predicated BIC, SVE only.
dnl Destructive on dst_src1; get_reg_variant supplies the element size
dnl required by the predicated form.
define(`VECTOR_AND_NOT_PREDICATE', `
instruct vand_not$1_masked`'(vReg dst_src1, vReg src2, imm$1_M1 m1, pRegGov pg) %{
  predicate(UseSVE > 0);
  match(Set dst_src1 (AndV (Binary dst_src1 (XorV src2 (Replicate m1))) pg));
  format %{ "vand_not$1_masked $dst_src1, $pg, $dst_src1, $src2" %}
  ins_encode %{
    __ sve_bic($dst_src1$$FloatRegister, get_reg_variant(this),
               $pg$$PRegister, $src2$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl

// vector and_not - predicated
VECTOR_AND_NOT_PREDICATE(I)
VECTOR_AND_NOT_PREDICATE(L)

dnl
dnl VECTOR_SATURATING_OP($1,     $2, $3     )
dnl VECTOR_SATURATING_OP(prefix, op, op_name)
dnl Emits the unpredicated saturating add/sub rules. The first argument
dnl (sq or uq) selects the signedness: for sq the predicate requires
dnl NOT is_unsigned(), for uq it requires is_unsigned(), so exactly one
dnl rule matches a given SaturatingAddV/SaturatingSubV node.
define(`VECTOR_SATURATING_OP', `
instruct v$1$2(vReg dst, vReg src1, vReg src2) %{
  predicate(ifelse($1, sq, `!',`')n->as_SaturatingVector()->is_unsigned());
  match(Set dst ($3 src1 src2));
  format %{ "v$1$2 $dst, $src1, $src2" %}
  ins_encode %{
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    if (VM_Version::use_neon_for_vector(length_in_bytes)) {
      __ $1$2v($dst$$FloatRegister, get_arrangement(this),
                $src1$$FloatRegister, $src2$$FloatRegister);
    } else {
      assert(UseSVE > 0, "must be sve");
      BasicType bt = Matcher::vector_element_basic_type(this);
      __ sve_$1$2($dst$$FloatRegister, __ elemType_to_regVariant(bt),
                   $src1$$FloatRegister, $src2$$FloatRegister);
    }
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl
dnl
dnl VECTOR_SATURATING_PREDICATE($1,     $2, $3     )
dnl VECTOR_SATURATING_PREDICATE(prefix, op, op_name)
dnl Emits the masked saturating add/sub rules. Predicated saturating
dnl arithmetic requires SVE2 (UseSVE == 2); destructive on dst_src1.
dnl The sq/uq prefix selects the is_unsigned() polarity as above.
define(`VECTOR_SATURATING_PREDICATE', `
instruct v$1$2_masked(vReg dst_src1, vReg src2, pRegGov pg) %{
  predicate(UseSVE == 2 && ifelse($1, sq, `!',`')n->as_SaturatingVector()->is_unsigned());
  match(Set dst_src1 ($3 (Binary dst_src1 src2) pg));
  format %{ "v$1$2_masked $dst_src1, $pg, $dst_src1, $src2" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ sve_$1$2($dst_src1$$FloatRegister, __ elemType_to_regVariant(bt),
                 $pg$$PRegister, $src2$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl
// ------------------------- Vector saturating add -----------------------------

// Signed saturating add
VECTOR_SATURATING_OP(sq, add, SaturatingAddV)
VECTOR_SATURATING_PREDICATE(sq, add, SaturatingAddV)

// Unsigned saturating add
VECTOR_SATURATING_OP(uq, add, SaturatingAddV)
VECTOR_SATURATING_PREDICATE(uq, add, SaturatingAddV)

// ------------------------- Vector saturating sub -----------------------------

// Signed saturating sub
VECTOR_SATURATING_OP(sq, sub, SaturatingSubV)
VECTOR_SATURATING_PREDICATE(sq, sub, SaturatingSubV)

// Unsigned saturating sub
VECTOR_SATURATING_OP(uq, sub, SaturatingSubV)
VECTOR_SATURATING_PREDICATE(uq, sub, SaturatingSubV)

dnl
dnl UNARY_OP($1,        $2,      $3,        $4,       $5  )
dnl UNARY_OP(rule_name, op_name, insn_neon, insn_sve, size)
dnl Emits an unpredicated unary rule: the NEON instruction is used for
dnl short vectors, otherwise the SVE instruction governed by ptrue with
dnl the fixed element size given as the last argument.
define(`UNARY_OP', `
instruct $1(vReg dst, vReg src) %{
  match(Set dst ($2 src));
  format %{ "$1 $dst, $src" %}
  ins_encode %{
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    if (VM_Version::use_neon_for_vector(length_in_bytes)) {
      __ $3($dst$$FloatRegister, get_arrangement(this), $src$$FloatRegister);
    } else {
      assert(UseSVE > 0, "must be sve");
      __ $4($dst$$FloatRegister, __ $5, ptrue, $src$$FloatRegister);
    }
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl
dnl UNARY_OP_PREDICATE($1,        $2,      $3  )
dnl UNARY_OP_PREDICATE(rule_name, op_name, insn)
dnl Emits a masked unary rule (SVE only) whose element size is derived
dnl from the node element type at code generation time. Destructive on
dnl dst_src.
define(`UNARY_OP_PREDICATE', `
instruct $1_masked(vReg dst_src, pRegGov pg) %{
  predicate(UseSVE > 0);
  match(Set dst_src ($2 dst_src pg));
  format %{ "$1_masked $dst_src, $pg, $dst_src" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ $3($dst_src$$FloatRegister, __ elemType_to_regVariant(bt),
               $pg$$PRegister, $dst_src$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl
dnl UNARY_OP_PREDICATE_WITH_SIZE($1,        $2,      $3,   $4  )
dnl UNARY_OP_PREDICATE_WITH_SIZE(rule_name, op_name, insn, size)
dnl Same as UNARY_OP_PREDICATE but with a fixed element size passed as
dnl the last argument, avoiding the runtime element-type lookup when
dnl the node type implies a single size.
define(`UNARY_OP_PREDICATE_WITH_SIZE', `
instruct $1_masked(vReg dst_src, pRegGov pg) %{
  predicate(UseSVE > 0);
  match(Set dst_src ($2 dst_src pg));
  format %{ "$1_masked $dst_src, $pg, $dst_src" %}
  ins_encode %{
    __ $3($dst_src$$FloatRegister, __ $4, $pg$$PRegister, $dst_src$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl
// ------------------------------ Vector abs -----------------------------------

// vector abs
UNARY_OP(vabsB, AbsVB, absr, sve_abs,  B)
UNARY_OP(vabsS, AbsVS, absr, sve_abs,  H)
UNARY_OP(vabsI, AbsVI, absr, sve_abs,  S)
UNARY_OP(vabsL, AbsVL, absr, sve_abs,  D)
UNARY_OP(vabsF, AbsVF, fabs, sve_fabs, S)
UNARY_OP(vabsD, AbsVD, fabs, sve_fabs, D)

// vector abs - predicated
UNARY_OP_PREDICATE_WITH_SIZE(vabsB, AbsVB, sve_abs,  B)
UNARY_OP_PREDICATE_WITH_SIZE(vabsS, AbsVS, sve_abs,  H)
UNARY_OP_PREDICATE_WITH_SIZE(vabsI, AbsVI, sve_abs,  S)
UNARY_OP_PREDICATE_WITH_SIZE(vabsL, AbsVL, sve_abs,  D)
UNARY_OP_PREDICATE_WITH_SIZE(vabsF, AbsVF, sve_fabs, S)
UNARY_OP_PREDICATE_WITH_SIZE(vabsD, AbsVD, sve_fabs, D)

// ------------------------------ Vector fabd ----------------------------------

// vector fabs diff

// Fused fabs(a - b): the NEON FABD instruction computes the absolute
// difference in one step. Float and double variants share this rule;
// the arrangement is derived from the node.
instruct vfabd_neon(vReg dst, vReg src1, vReg src2) %{
  predicate(VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n)));
  match(Set dst (AbsVF (SubVF src1 src2)));
  match(Set dst (AbsVD (SubVD src1 src2)));
  format %{ "vfabd_neon $dst, $src1, $src2" %}
  ins_encode %{
    __ fabd($dst$$FloatRegister, get_arrangement(this),
            $src1$$FloatRegister, $src2$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

// Fused fabs(a - b) on SVE. sve_fabd is destructive: dst_src1 supplies
// the first operand and receives the result, governed by ptrue.
instruct vfabd_sve(vReg dst_src1, vReg src2) %{
  predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n)));
  match(Set dst_src1 (AbsVF (SubVF dst_src1 src2)));
  match(Set dst_src1 (AbsVD (SubVD dst_src1 src2)));
  format %{ "vfabd_sve $dst_src1, $dst_src1, $src2" %}
  ins_encode %{
    assert(UseSVE > 0, "must be sve");
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ sve_fabd($dst_src1$$FloatRegister, __ elemType_to_regVariant(bt),
                ptrue, $src2$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

// vector fabs diff - predicated

// Masked fused fabs(a - b), SVE only. The same predicate register
// governs both the subtract and the abs in the matched subtree.
instruct vfabd_masked(vReg dst_src1, vReg src2, pRegGov pg) %{
  predicate(UseSVE > 0);
  match(Set dst_src1 (AbsVF (SubVF (Binary dst_src1 src2) pg) pg));
  match(Set dst_src1 (AbsVD (SubVD (Binary dst_src1 src2) pg) pg));
  format %{ "vfabd_masked $dst_src1, $pg, $dst_src1, $src2" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ sve_fabd($dst_src1$$FloatRegister, __ elemType_to_regVariant(bt),
                $pg$$PRegister, $src2$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

// ------------------------------ Vector neg -----------------------------------

// vector neg

// Vector integer negate. Written out rather than via UNARY_OP because
// NegVI covers several element sizes, so the SVE size must be derived
// from the node element type instead of being fixed.
instruct vnegI(vReg dst, vReg src) %{
  match(Set dst (NegVI src));
  format %{ "vnegI $dst, $src" %}
  ins_encode %{
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    if (VM_Version::use_neon_for_vector(length_in_bytes)) {
      __ negr($dst$$FloatRegister, get_arrangement(this), $src$$FloatRegister);
    } else {
      assert(UseSVE > 0, "must be sve");
      BasicType bt = Matcher::vector_element_basic_type(this);
      __ sve_neg($dst$$FloatRegister, __ elemType_to_regVariant(bt),
                 ptrue, $src$$FloatRegister);
    }
  %}
  ins_pipe(pipe_slow);
%}
UNARY_OP(vnegL, NegVL, negr, sve_neg,  D)
UNARY_OP(vnegF, NegVF, fneg, sve_fneg, S)
UNARY_OP(vnegD, NegVD, fneg, sve_fneg, D)

// vector neg - predicated
UNARY_OP_PREDICATE(vnegI, NegVI, sve_neg)
UNARY_OP_PREDICATE_WITH_SIZE(vnegL, NegVL, sve_neg,  D)
UNARY_OP_PREDICATE_WITH_SIZE(vnegF, NegVF, sve_fneg, S)
UNARY_OP_PREDICATE_WITH_SIZE(vnegD, NegVD, sve_fneg, D)

// ------------------------------ Vector sqrt ----------------------------------

// vector sqrt
UNARY_OP(vsqrtHF, SqrtVHF, fsqrt, sve_fsqrt, H)
UNARY_OP(vsqrtF,  SqrtVF,  fsqrt, sve_fsqrt, S)
UNARY_OP(vsqrtD,  SqrtVD,  fsqrt, sve_fsqrt, D)

// vector sqrt - predicated
UNARY_OP_PREDICATE_WITH_SIZE(vsqrtF, SqrtVF, sve_fsqrt, S)
UNARY_OP_PREDICATE_WITH_SIZE(vsqrtD, SqrtVD, sve_fsqrt, D)

dnl
dnl VMINMAX_L_NEON($1,   $2     , $3  )
dnl VMINMAX_L_NEON(type, op_name, sign)
dnl Emits the NEON long (2L) min/max rules. NEON has no 64-bit integer
dnl min/max instruction, so the sequence is compare then bitwise select:
dnl CM builds a per-lane mask (GT for signed, HI for unsigned), then BSL
dnl picks from the source that the mask condition favours. The third
dnl argument is the u name prefix for unsigned variants, empty for
dnl signed. Note both ifelse selectors below compare the first macro
dnl argument against min, in the same argument order, so the operand
dnl pair is simply swapped between the min and max expansions.
define(`VMINMAX_L_NEON', `
instruct v$3$1L_neon(vReg dst, vReg src1, vReg src2) %{
  predicate(UseSVE == 0 && Matcher::vector_element_basic_type(n) == T_LONG);
  match(Set dst ($2 src1 src2));
  effect(TEMP_DEF dst);
  format %{ "v$3$1L_neon $dst, $src1, $src2\t# 2L" %}
  ins_encode %{
    __ cm(Assembler::ifelse($3, u, HI, GT), $dst$$FloatRegister, __ T2D, $src1$$FloatRegister, $src2$$FloatRegister);
    __ bsl($dst$$FloatRegister, __ T16B, ifelse($1, min, $src2, $src1)$$FloatRegister, ifelse($1, min, $src1, $src2)$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl
dnl VMINMAX_L_SVE($1,   $2,      $3  )
dnl VMINMAX_L_SVE(type, op_name, insn)
dnl Emits the SVE long min/max rules: a destructive predicated
dnl smin/smax/umin/umax over the whole vector (ptrue) with D-sized
dnl elements.
define(`VMINMAX_L_SVE', `
instruct v$1L_sve(vReg dst_src1, vReg src2) %{
  predicate(UseSVE > 0 && Matcher::vector_element_basic_type(n) == T_LONG);
  match(Set dst_src1 ($2 dst_src1 src2));
  format %{ "v$1L_sve $dst_src1, $dst_src1, $src2" %}
  ins_encode %{
    __ $3($dst_src1$$FloatRegister, __ D, ptrue, $src2$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl
dnl VMINMAX_NEON($1,   $2,      $3,      $4           )
dnl VMINMAX_NEON(type, op_name, insn_fp, insn_integral)
dnl Emits the NEON min/max rule for B/S/I/F/D: dispatches at encode
dnl time between the FP instruction and the integral instruction on the
dnl node element type. LONG is excluded here and handled by the
dnl dedicated 2L rules above.
define(`VMINMAX_NEON', `
instruct v$1_neon(vReg dst, vReg src1, vReg src2) %{
  predicate(Matcher::vector_element_basic_type(n) != T_LONG &&
            VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n)));
  match(Set dst ($2 src1 src2));
  format %{ "v$1_neon $dst, $src1, $src2\t# B/S/I/F/D" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    if (is_floating_point_type(bt)) {
      __ $3($dst$$FloatRegister, get_arrangement(this),
              $src1$$FloatRegister, $src2$$FloatRegister);
    } else {
      assert(is_integral_type(bt) && bt != T_LONG, "unsupported type");
      __ $4($dst$$FloatRegister, get_arrangement(this),
              $src1$$FloatRegister, $src2$$FloatRegister);
    }
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl
dnl VMINMAX_HF_NEON($1,   $2,      $3     )
dnl VMINMAX_HF_NEON(type, op_name, insn_fp)
dnl Emits the NEON half-float min/max rule. Half float uses dedicated
dnl node types (MinVHF/MaxVHF), so no element-type exclusion is needed
dnl in the predicate.
define(`VMINMAX_HF_NEON', `
instruct v$1_HF_neon(vReg dst, vReg src1, vReg src2) %{
  predicate(VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n)));
  match(Set dst ($2 src1 src2));
  format %{ "v$1_HF_neon $dst, $src1, $src2\t# Half float" %}
  ins_encode %{
    __ $3($dst$$FloatRegister, get_arrangement(this),
            $src1$$FloatRegister, $src2$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl
dnl VMINMAX_SVE($1,   $2,      $3,      $4           )
dnl VMINMAX_SVE(type, op_name, insn_fp, insn_integral)
dnl Emits the SVE min/max rule for B/S/I/F/D: destructive predicated
dnl instruction over the whole vector (ptrue), dispatching between the
dnl FP and integral forms on the node element type. LONG is excluded
dnl and handled by the dedicated L rules.
define(`VMINMAX_SVE', `
instruct v$1_sve(vReg dst_src1, vReg src2) %{
  predicate(Matcher::vector_element_basic_type(n) != T_LONG &&
            !VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n)));
  match(Set dst_src1 ($2 dst_src1 src2));
  format %{ "v$1_sve $dst_src1, $dst_src1, $src2\t# B/S/I/F/D" %}
  ins_encode %{
    assert(UseSVE > 0, "must be sve");
    BasicType bt = Matcher::vector_element_basic_type(this);
    if (is_floating_point_type(bt)) {
      __ $3($dst_src1$$FloatRegister, __ elemType_to_regVariant(bt),
                  ptrue, $src2$$FloatRegister);
    } else {
      assert(is_integral_type(bt) && bt != T_LONG, "unsupported type");
      __ $4($dst_src1$$FloatRegister, __ elemType_to_regVariant(bt),
                  ptrue, $src2$$FloatRegister);
    }
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl
dnl VMINMAX_HF_SVE($1,   $2,      $3     )
dnl VMINMAX_HF_SVE(type, op_name, insn_fp)
dnl Emits the SVE half-float min/max rule: destructive predicated FP
dnl min/max with a fixed H element size.
define(`VMINMAX_HF_SVE', `
instruct v$1_HF_sve(vReg dst_src1, vReg src2) %{
  predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n)));
  match(Set dst_src1 ($2 dst_src1 src2));
  format %{ "v$1_HF_sve $dst_src1, $dst_src1, $src2\t# Half float" %}
  ins_encode %{
    assert(UseSVE > 0, "must be sve");
    __ $3($dst_src1$$FloatRegister, __ H,
                ptrue, $src2$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl
dnl VMINMAX_PREDICATE($1,   $2,      $3,      $4           )
dnl VMINMAX_PREDICATE(type, op_name, insn_fp, insn_integral)
dnl Emits the masked min/max rule (SVE only), covering FP and integral
dnl element types including LONG, hence no T_LONG exclusion here.
dnl Destructive on dst_src1; inactive lanes are unchanged.
define(`VMINMAX_PREDICATE', `
instruct v$1_masked(vReg dst_src1, vReg src2, pRegGov pg) %{
  predicate(UseSVE > 0);
  match(Set dst_src1 ($2 (Binary dst_src1 src2) pg));
  format %{ "v$1_masked $dst_src1, $pg, $dst_src1, $src2" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    if (is_floating_point_type(bt)) {
      __ $3($dst_src1$$FloatRegister, __ elemType_to_regVariant(bt),
                  $pg$$PRegister, $src2$$FloatRegister);
    } else {
      assert(is_integral_type(bt), "unsupported type");
      __ $4($dst_src1$$FloatRegister, __ elemType_to_regVariant(bt),
                  $pg$$PRegister, $src2$$FloatRegister);
    }
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl
dnl VUMINMAX_NEON($1,   $2,      $3  )
dnl VUMINMAX_NEON(type, op_name, insn)
dnl Emits the NEON unsigned min/max rule for B/S/I element types; LONG
dnl is excluded and handled by the dedicated L rules.
define(`VUMINMAX_NEON', `
instruct v$1_neon(vReg dst, vReg src1, vReg src2) %{
  predicate(Matcher::vector_element_basic_type(n) != T_LONG &&
            VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n)));
  match(Set dst ($2 src1 src2));
  format %{ "v$1_neon $dst, $src1, $src2\t# B/S/I" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    assert(is_integral_type(bt) && bt != T_LONG, "unsupported type");
    __ $3($dst$$FloatRegister, get_arrangement(this),
             $src1$$FloatRegister, $src2$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl
dnl VUMINMAX_SVE($1,   $2,      $3  )
dnl VUMINMAX_SVE(type, op_name, insn)
dnl Emits the SVE unsigned min/max rule for B/S/I: destructive
dnl predicated umin/umax over the whole vector (ptrue).
define(`VUMINMAX_SVE', `
instruct v$1_sve(vReg dst_src1, vReg src2) %{
  predicate(Matcher::vector_element_basic_type(n) != T_LONG &&
            !VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n)));
  match(Set dst_src1 ($2 dst_src1 src2));
  format %{ "v$1_sve $dst_src1, $dst_src1, $src2\t# B/S/I" %}
  ins_encode %{
    assert(UseSVE > 0, "must be sve");
    BasicType bt = Matcher::vector_element_basic_type(this);
    assert(is_integral_type(bt) && bt != T_LONG, "unsupported type");
    __ $3($dst_src1$$FloatRegister, __ elemType_to_regVariant(bt),
                ptrue, $src2$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl
dnl VUMINMAX_PREDICATE($1,   $2,      $3  )
dnl VUMINMAX_PREDICATE(type, op_name, insn)
dnl Emits the masked unsigned min/max rule (SVE only); covers all
dnl integral element types including LONG. Destructive on dst_src1.
define(`VUMINMAX_PREDICATE', `
instruct v$1_masked(vReg dst_src1, vReg src2, pRegGov pg) %{
  predicate(UseSVE > 0);
  match(Set dst_src1 ($2 (Binary dst_src1 src2) pg));
  format %{ "v$1_masked $dst_src1, $pg, $dst_src1, $src2" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    assert(is_integral_type(bt), "unsupported type");
    __ $3($dst_src1$$FloatRegister, __ elemType_to_regVariant(bt),
                $pg$$PRegister, $src2$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl
// ------------------------------ Vector min -----------------------------------

// vector min - LONG
VMINMAX_L_NEON(min, MinV)
VMINMAX_L_SVE(min, MinV, sve_smin)

// vector min - B/S/I/HF/F/D
VMINMAX_NEON(min, MinV, fmin, minv)
VMINMAX_SVE(min, MinV, sve_fmin, sve_smin)
VMINMAX_HF_NEON(min, MinVHF, fmin)
VMINMAX_HF_SVE(min, MinVHF, sve_fmin)

// vector min - predicated
VMINMAX_PREDICATE(min, MinV, sve_fmin, sve_smin)

// vector unsigned min - LONG
VMINMAX_L_NEON(min, UMinV, u)
VMINMAX_L_SVE(umin, UMinV, sve_umin)

// vector unsigned min - B/S/I
VUMINMAX_NEON(umin, UMinV, uminv)
VUMINMAX_SVE(umin, UMinV, sve_umin)

// vector unsigned min - predicated
VUMINMAX_PREDICATE(umin, UMinV, sve_umin)

// ------------------------------ Vector max -----------------------------------

// vector max - LONG
VMINMAX_L_NEON(max, MaxV)
VMINMAX_L_SVE(max, MaxV, sve_smax)

// vector max - B/S/I/HF/F/D
VMINMAX_NEON(max, MaxV, fmax, maxv)
VMINMAX_SVE(max, MaxV, sve_fmax, sve_smax)
VMINMAX_HF_NEON(max, MaxVHF, fmax)
VMINMAX_HF_SVE(max, MaxVHF, sve_fmax)

// vector max - predicated
VMINMAX_PREDICATE(max, MaxV, sve_fmax, sve_smax)

// vector unsigned max - LONG
VMINMAX_L_NEON(max, UMaxV, u)
VMINMAX_L_SVE(umax, UMaxV, sve_umax)

// vector unsigned max - B/S/I
VUMINMAX_NEON(umax, UMaxV, umaxv)
VUMINMAX_SVE(umax, UMaxV, sve_umax)

// vector unsigned max - predicated
VUMINMAX_PREDICATE(umax, UMaxV, sve_umax)

// ------------------------------ MLA RELATED ----------------------------------

// vector mla
// dst_src1 = dst_src1 + src2 * src3

// Integer multiply-accumulate for B/S/I: dst_src1 += src2 * src3.
// Destructive on dst_src1 on both NEON (MLA) and SVE (predicated MLA
// governed by ptrue). LONG is SVE-only and handled by vmlaL below.
instruct vmla(vReg dst_src1, vReg src2, vReg src3) %{
  match(Set dst_src1 (AddVB dst_src1 (MulVB src2 src3)));
  match(Set dst_src1 (AddVS dst_src1 (MulVS src2 src3)));
  match(Set dst_src1 (AddVI dst_src1 (MulVI src2 src3)));
  format %{ "vmla $dst_src1, $src2, $src3" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    if (VM_Version::use_neon_for_vector(length_in_bytes)) {
      __ mlav($dst_src1$$FloatRegister, get_arrangement(this),
              $src2$$FloatRegister, $src3$$FloatRegister);
    } else {
      assert(UseSVE > 0, "must be sve");
      __ sve_mla($dst_src1$$FloatRegister, __ elemType_to_regVariant(bt),
                 ptrue, $src2$$FloatRegister, $src3$$FloatRegister);
    }
  %}
  ins_pipe(pipe_slow);
%}

// LONG multiply-accumulate: dst_src1 += src2 * src3. SVE only — this
// rule has no NEON fallback (note the UseSVE > 0 predicate).
instruct vmlaL(vReg dst_src1, vReg src2, vReg src3) %{
  predicate(UseSVE > 0);
  match(Set dst_src1 (AddVL dst_src1 (MulVL src2 src3)));
  format %{ "vmlaL $dst_src1, $src2, $src3" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ sve_mla($dst_src1$$FloatRegister, __ elemType_to_regVariant(bt),
               ptrue, $src2$$FloatRegister, $src3$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

// Masked multiply-accumulate for B/S/I/L (SVE only):
// active lanes of dst_src1 get dst_src1 + src2 * src3.
instruct vmla_masked(vReg dst_src1, vReg src2, vReg src3, pRegGov pg) %{
  predicate(UseSVE > 0);
  match(Set dst_src1 (AddVB (Binary dst_src1 (MulVB src2 src3)) pg));
  match(Set dst_src1 (AddVS (Binary dst_src1 (MulVS src2 src3)) pg));
  match(Set dst_src1 (AddVI (Binary dst_src1 (MulVI src2 src3)) pg));
  match(Set dst_src1 (AddVL (Binary dst_src1 (MulVL src2 src3)) pg));
  format %{ "vmla_masked $dst_src1, $pg, $src2, $src3" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ sve_mla($dst_src1$$FloatRegister, __ elemType_to_regVariant(bt),
               $pg$$PRegister, $src2$$FloatRegister, $src3$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

// vector fmla
// dst_src1 = src2 * src3 + dst_src1

// Fused FP multiply-add: dst_src1 = src2 * src3 + dst_src1, for half,
// single and double precision. Destructive on dst_src1 on both paths.
instruct vfmla(vReg dst_src1, vReg src2, vReg src3) %{
  match(Set dst_src1 (FmaVHF dst_src1 (Binary src2 src3)));
  match(Set dst_src1 (FmaVF  dst_src1 (Binary src2 src3)));
  match(Set dst_src1 (FmaVD  dst_src1 (Binary src2 src3)));
  format %{ "vfmla $dst_src1, $src2, $src3" %}
  ins_encode %{
    assert(UseFMA, "Needs FMA instructions support.");
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    if (VM_Version::use_neon_for_vector(length_in_bytes)) {
      __ fmla($dst_src1$$FloatRegister, get_arrangement(this),
              $src2$$FloatRegister, $src3$$FloatRegister);
    } else {
      assert(UseSVE > 0, "must be sve");
      BasicType bt = Matcher::vector_element_basic_type(this);
      __ sve_fmla($dst_src1$$FloatRegister, __ elemType_to_regVariant(bt),
                  ptrue, $src2$$FloatRegister, $src3$$FloatRegister);
    }
  %}
  ins_pipe(pipe_slow);
%}

// vector fmad - predicated
// dst_src1 = dst_src1 * src2 + src3

// Masked fused multiply-add (SVE FMAD form):
// active lanes of dst_src1 get dst_src1 * src2 + src3.
instruct vfmad_masked(vReg dst_src1, vReg src2, vReg src3, pRegGov pg) %{
  predicate(UseSVE > 0);
  match(Set dst_src1 (FmaVF (Binary dst_src1 src2) (Binary src3 pg)));
  match(Set dst_src1 (FmaVD (Binary dst_src1 src2) (Binary src3 pg)));
  format %{ "vfmad_masked $dst_src1, $pg, $src2, $src3" %}
  ins_encode %{
    assert(UseFMA, "Needs FMA instructions support.");
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ sve_fmad($dst_src1$$FloatRegister, __ elemType_to_regVariant(bt),
                $pg$$PRegister, $src2$$FloatRegister, $src3$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

// vector mls
// dst_src1 = dst_src1 - src2 * src3

// Integer multiply-subtract for B/S/I: dst_src1 -= src2 * src3.
// Destructive on dst_src1 on both NEON (MLS) and SVE (predicated MLS).
instruct vmls(vReg dst_src1, vReg src2, vReg src3) %{
  match(Set dst_src1 (SubVB dst_src1 (MulVB src2 src3)));
  match(Set dst_src1 (SubVS dst_src1 (MulVS src2 src3)));
  match(Set dst_src1 (SubVI dst_src1 (MulVI src2 src3)));
  format %{ "vmls $dst_src1, $src2, $src3" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    if (VM_Version::use_neon_for_vector(length_in_bytes)) {
      __ mlsv($dst_src1$$FloatRegister, get_arrangement(this),
              $src2$$FloatRegister, $src3$$FloatRegister);
    } else {
      assert(UseSVE > 0, "must be sve");
      __ sve_mls($dst_src1$$FloatRegister, __ elemType_to_regVariant(bt),
                 ptrue, $src2$$FloatRegister, $src3$$FloatRegister);
    }
  %}
  ins_pipe(pipe_slow);
%}

// LONG multiply-subtract: dst_src1 -= src2 * src3. SVE only, mirroring
// vmlaL above.
instruct vmlsL(vReg dst_src1, vReg src2, vReg src3) %{
  predicate(UseSVE > 0);
  match(Set dst_src1 (SubVL dst_src1 (MulVL src2 src3)));
  format %{ "vmlsL $dst_src1, $src2, $src3" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ sve_mls($dst_src1$$FloatRegister, __ elemType_to_regVariant(bt),
                 ptrue, $src2$$FloatRegister, $src3$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

// Masked multiply-subtract for B/S/I/L (SVE only):
// active lanes of dst_src1 get dst_src1 - src2 * src3.
instruct vmls_masked(vReg dst_src1, vReg src2, vReg src3, pRegGov pg) %{
  predicate(UseSVE > 0);
  match(Set dst_src1 (SubVB (Binary dst_src1 (MulVB src2 src3)) pg));
  match(Set dst_src1 (SubVS (Binary dst_src1 (MulVS src2 src3)) pg));
  match(Set dst_src1 (SubVI (Binary dst_src1 (MulVI src2 src3)) pg));
  match(Set dst_src1 (SubVL (Binary dst_src1 (MulVL src2 src3)) pg));
  format %{ "vmls_masked $dst_src1, $pg, $src2, $src3" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ sve_mls($dst_src1$$FloatRegister, __ elemType_to_regVariant(bt),
               $pg$$PRegister, $src2$$FloatRegister, $src3$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

// vector fmls

// dst_src1 = src2 * (-src3) + dst_src1
// "(-src2) * src3 + dst_src1" has been idealized to "src3 * (-src2) + dst_src1"
// Fused FP multiply-subtract: dst_src1 = src2 * (-src3) + dst_src1.
// Matches the idealized form where the negation sits on the second
// multiplicand (see the comment above). Destructive on dst_src1.
instruct vfmls(vReg dst_src1, vReg src2, vReg src3) %{
  match(Set dst_src1 (FmaVF dst_src1 (Binary src2 (NegVF src3))));
  match(Set dst_src1 (FmaVD dst_src1 (Binary src2 (NegVD src3))));
  format %{ "vfmls $dst_src1, $src2, $src3" %}
  ins_encode %{
    assert(UseFMA, "Needs FMA instructions support.");
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    if (VM_Version::use_neon_for_vector(length_in_bytes)) {
      __ fmls($dst_src1$$FloatRegister, get_arrangement(this),
              $src2$$FloatRegister, $src3$$FloatRegister);
    } else {
      assert(UseSVE > 0, "must be sve");
      BasicType bt = Matcher::vector_element_basic_type(this);
      __ sve_fmls($dst_src1$$FloatRegister, __ elemType_to_regVariant(bt),
                  ptrue, $src2$$FloatRegister, $src3$$FloatRegister);
    }
  %}
  ins_pipe(pipe_slow);
%}

// vector fmsb - predicated

// dst_src1 = dst_src1 * (-src2) + src3
// Masked fused multiply-subtract (SVE FMSB form):
// active lanes of dst_src1 get dst_src1 * (-src2) + src3.
instruct vfmsb_masked(vReg dst_src1, vReg src2, vReg src3, pRegGov pg) %{
  predicate(UseSVE > 0);
  match(Set dst_src1 (FmaVF (Binary dst_src1 (NegVF src2)) (Binary src3 pg)));
  match(Set dst_src1 (FmaVD (Binary dst_src1 (NegVD src2)) (Binary src3 pg)));
  format %{ "vfmsb_masked $dst_src1, $pg, $src2, $src3" %}
  ins_encode %{
    assert(UseFMA, "Needs FMA instructions support.");
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ sve_fmsb($dst_src1$$FloatRegister, __ elemType_to_regVariant(bt),
                $pg$$PRegister, $src2$$FloatRegister, $src3$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

// vector fnmla (sve)

// dst_src1 = src2 * (-src3) - dst_src1
// "(-src2) * src3 - dst_src1" has been idealized to "src3 * (-src2) - dst_src1"
// Fused negated multiply-add (SVE only):
// dst_src1 = src2 * (-src3) - dst_src1.
instruct vfnmla(vReg dst_src1, vReg src2, vReg src3) %{
  predicate(UseSVE > 0);
  match(Set dst_src1 (FmaVF (NegVF dst_src1) (Binary src2 (NegVF src3))));
  match(Set dst_src1 (FmaVD (NegVD dst_src1) (Binary src2 (NegVD src3))));
  format %{ "vfnmla $dst_src1, $src2, $src3" %}
  ins_encode %{
    assert(UseFMA, "Needs FMA instructions support.");
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ sve_fnmla($dst_src1$$FloatRegister, __ elemType_to_regVariant(bt),
                 ptrue, $src2$$FloatRegister, $src3$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

// vector fnmad - predicated

// dst_src1 = dst_src1 * (-src2) - src3
// Masked fused negated multiply-add (SVE FNMAD form):
// active lanes of dst_src1 get dst_src1 * (-src2) - src3.
instruct vfnmad_masked(vReg dst_src1, vReg src2, vReg src3, pRegGov pg) %{
  predicate(UseSVE > 0);
  match(Set dst_src1 (FmaVF (Binary dst_src1 (NegVF src2)) (Binary (NegVF src3) pg)));
  match(Set dst_src1 (FmaVD (Binary dst_src1 (NegVD src2)) (Binary (NegVD src3) pg)));
  format %{ "vfnmad_masked $dst_src1, $pg, $src2, $src3" %}
  ins_encode %{
    assert(UseFMA, "Needs FMA instructions support.");
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ sve_fnmad($dst_src1$$FloatRegister, __ elemType_to_regVariant(bt),
                 $pg$$PRegister, $src2$$FloatRegister, $src3$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

// vector fnmls (sve)

// dst_src1 = src2 * src3 - dst_src1
// Fused negated multiply-subtract (SVE only):
// dst_src1 = src2 * src3 - dst_src1.
instruct vfnmls(vReg dst_src1, vReg src2, vReg src3) %{
  predicate(UseSVE > 0);
  match(Set dst_src1 (FmaVF (NegVF dst_src1) (Binary src2 src3)));
  match(Set dst_src1 (FmaVD (NegVD dst_src1) (Binary src2 src3)));
  format %{ "vfnmls $dst_src1, $src2, $src3" %}
  ins_encode %{
    assert(UseFMA, "Needs FMA instructions support.");
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ sve_fnmls($dst_src1$$FloatRegister, __ elemType_to_regVariant(bt),
                 ptrue, $src2$$FloatRegister, $src3$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

// vector fnmsb - predicated

// dst_src1 = dst_src1 * src2 - src3
// Masked fused negated multiply-subtract (SVE FNMSB form):
// active lanes of dst_src1 get dst_src1 * src2 - src3.
instruct vfnmsb_masked(vReg dst_src1, vReg src2, vReg src3, pRegGov pg) %{
  predicate(UseSVE > 0);
  match(Set dst_src1 (FmaVF (Binary dst_src1 src2) (Binary (NegVF src3) pg)));
  match(Set dst_src1 (FmaVD (Binary dst_src1 src2) (Binary (NegVD src3) pg)));
  format %{ "vfnmsb_masked $dst_src1, $pg, $src2, $src3" %}
  ins_encode %{
    assert(UseFMA, "Needs FMA instructions support.");
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ sve_fnmsb($dst_src1$$FloatRegister, __ elemType_to_regVariant(bt),
                 $pg$$PRegister, $src2$$FloatRegister, $src3$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

// MulAddVS2VI
// Vector Multiply-Add Shorts into Integer

// MulAddVS2VI: widening multiply of short lanes followed by pairwise
// add into int lanes. smullv with a 4H source arrangement multiplies
// the low halves into tmp; the 8H arrangement selects the high-half
// (smull2) form into dst; addpv then folds adjacent products into 4S.
// dst is TEMP_DEF because it is written before all inputs are consumed.
instruct vmuladdS2I(vReg dst, vReg src1, vReg src2, vReg tmp) %{
  predicate(Matcher::vector_length_in_bytes(n) == 16 &&
            Matcher::vector_element_basic_type(n->in(1)) == T_SHORT);
  match(Set dst (MulAddVS2VI src1 src2));
  effect(TEMP_DEF dst, TEMP tmp);
  format %{ "vmuladdS2I $dst, $src1, $src2\t# KILL $tmp" %}
  ins_encode %{
    __ smullv($tmp$$FloatRegister, __ T4H, $src1$$FloatRegister, $src2$$FloatRegister);
    __ smullv($dst$$FloatRegister, __ T8H, $src1$$FloatRegister, $src2$$FloatRegister);
    __ addpv($dst$$FloatRegister, __ T4S, $tmp$$FloatRegister, $dst$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

// ------------------------------ Vector shift ---------------------------------

// Vector right shift in AArch64 ASIMD
//
// Right shifts with vector shift count on AArch64 ASIMD are implemented
// as left shift by negative shift count.
// There are two cases for vector shift count.
//
// Case 1: The vector shift count is from replication.
//        |            |
//    LoadVector  RShiftCntV
//        |       /
//     RShiftVI
//
// Case 2: The vector shift count is from loading.
// This case isn't supported by middle-end now. But it's supported by
// panama/vectorIntrinsics(JEP 338: Vector API).
//        |            |
//    LoadVector  LoadVector
//        |       /
//     RShiftVI
//
// The negate is conducted in RShiftCntV rule for case 1, whereas it's done in
// RShiftV* rules for case 2. Because there exists an optimization opportunity
// for case 1, that is, multiple neg instructions in inner loop can be hoisted
// to outer loop and merged into one neg instruction.
//
// Note that ShiftVNode::is_var_shift() indicates whether the vector shift
// count is a variable vector (case 2) or not (a vector generated by
// RShiftCntV, i.e. case 1).

// vector shift count

instruct vshiftcntL(vReg dst, iRegIorL2I cnt) %{
  match(Set dst (LShiftCntV cnt));
  format %{ "vshiftcntL $dst, $cnt" %}
  ins_encode %{
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    if (VM_Version::use_neon_for_vector(length_in_bytes)) {
      __ dup($dst$$FloatRegister, get_arrangement(this), $cnt$$Register);
    } else {
      assert(UseSVE > 0, "must be sve");
      BasicType bt = Matcher::vector_element_basic_type(this);
      __ sve_dup($dst$$FloatRegister, __ elemType_to_regVariant(bt), $cnt$$Register);
    }
  %}
  ins_pipe(pipe_slow);
%}

instruct vshiftcntR(vReg dst, iRegIorL2I cnt) %{
  match(Set dst (RShiftCntV cnt));
  format %{ "vshiftcntR $dst, $cnt" %}
  ins_encode %{
    if (UseSVE == 0) {
      uint length_in_bytes = Matcher::vector_length_in_bytes(this);
      assert(length_in_bytes <= 16, "must be");
      __ negw(rscratch1, $cnt$$Register);
      __ dup($dst$$FloatRegister, get_arrangement(this), rscratch1);
    } else {
      BasicType bt = Matcher::vector_element_basic_type(this);
      __ sve_dup($dst$$FloatRegister, __ elemType_to_regVariant(bt), $cnt$$Register);
    }
  %}
  ins_pipe(pipe_slow);
%}

// vector shift left

// Vector shift left by a vector of per-lane counts (NEON, vector fits in
// 128 bits).
instruct vlsl_neon(vReg dst, vReg src, vReg shift) %{
  predicate(VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n)));
  match(Set dst (LShiftVB src shift));
  match(Set dst (LShiftVS src shift));
  match(Set dst (LShiftVI src shift));
  match(Set dst (LShiftVL src shift));
  format %{ "vlsl_neon $dst, $src, $shift" %}
  ins_encode %{
    __ sshl($dst$$FloatRegister, get_arrangement(this),
            $src$$FloatRegister, $shift$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

// Vector shift left by a vector of per-lane counts (SVE).
// Destructive two-address form: dst_src is both the first input and the
// result; all lanes are active (ptrue).
instruct vlsl_sve(vReg dst_src, vReg shift) %{
  predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n)));
  match(Set dst_src (LShiftVB dst_src shift));
  match(Set dst_src (LShiftVS dst_src shift));
  match(Set dst_src (LShiftVI dst_src shift));
  match(Set dst_src (LShiftVL dst_src shift));
  format %{ "vlsl_sve $dst_src, $dst_src, $shift" %}
  ins_encode %{
    assert(UseSVE > 0, "must be sve");
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ sve_lsl($dst_src$$FloatRegister, __ elemType_to_regVariant(bt),
               ptrue, $shift$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

dnl
dnl VRSHIFT_NEON($1,   $2,      $3  )
dnl VRSHIFT_NEON(type, op_name, insn)
dnl Generates the NEON rule for a right shift whose count vector comes from
dnl RShiftCntV (case 1 in the notes above). The count was already negated
dnl in vshiftcntR, so the shift-by-vector insn shifts right here.
define(`VRSHIFT_NEON', `
instruct v$1_neon(vReg dst, vReg src, vReg shift) %{
  predicate(UseSVE == 0 && !n->as_ShiftV()->is_var_shift());
  match(Set dst ($2VB src shift));
  match(Set dst ($2VS src shift));
  match(Set dst ($2VI src shift));
  match(Set dst ($2VL src shift));
  format %{ "v$1_neon $dst, $src, $shift\t# not variable shift" %}
  ins_encode %{
    __ $3($dst$$FloatRegister, get_arrangement(this),
            $src$$FloatRegister, $shift$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl
dnl VRSHIFT_NEON_VAR($1,   $2,      $3  )
dnl VRSHIFT_NEON_VAR(type, op_name, insn)
dnl Generates the NEON rule for a variable right shift (case 2 in the notes
dnl above): the loaded count vector is negated at the use site, into dst
dnl (hence TEMP_DEF dst), before the shift-by-vector insn is applied.
define(`VRSHIFT_NEON_VAR', `
instruct v$1_neon_var(vReg dst, vReg src, vReg shift) %{
  predicate(UseSVE == 0 && n->as_ShiftV()->is_var_shift());
  match(Set dst ($2VB src shift));
  match(Set dst ($2VS src shift));
  match(Set dst ($2VI src shift));
  match(Set dst ($2VL src shift));
  effect(TEMP_DEF dst);
  format %{ "v$1_neon_var $dst, $src, $shift\t# variable shift" %}
  ins_encode %{
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    __ negr($dst$$FloatRegister, length_in_bytes == 16 ? __ T16B : __ T8B,
            $shift$$FloatRegister);
    __ $3($dst$$FloatRegister, get_arrangement(this),
            $src$$FloatRegister, $dst$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl
dnl VRSHIFT_SVE($1,   $2,      $3  )
dnl VRSHIFT_SVE(type, op_name, insn)
dnl Generates the SVE rule for right shift by a vector of counts. SVE has
dnl native right-shift-by-vector insns so no negation of the count is
dnl needed. Destructive form: dst_src is both first input and result.
define(`VRSHIFT_SVE', `
instruct v$1_sve(vReg dst_src, vReg shift) %{
  predicate(UseSVE > 0);
  match(Set dst_src ($2VB dst_src shift));
  match(Set dst_src ($2VS dst_src shift));
  match(Set dst_src ($2VI dst_src shift));
  match(Set dst_src ($2VL dst_src shift));
  format %{ "v$1_sve $dst_src, $dst_src, $shift" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ $3($dst_src$$FloatRegister, __ elemType_to_regVariant(bt),
               ptrue, $shift$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl
// vector shift right (arithmetic)
VRSHIFT_NEON(asr, RShift, sshl)
VRSHIFT_NEON_VAR(asr, RShift, sshl)
VRSHIFT_SVE(asr, RShift, sve_asr)

// vector shift right (logical)
VRSHIFT_NEON(lsr, URShift, ushl)
VRSHIFT_NEON_VAR(lsr, URShift, ushl)
VRSHIFT_SVE(lsr, URShift, sve_lsr)

// vector shift with imm

// Vector shift left by an immediate count.
instruct vlsl_imm(vReg dst, vReg src, immI shift) %{
  predicate(assert_not_var_shift(n));
  match(Set dst (LShiftVB src (LShiftCntV shift)));
  match(Set dst (LShiftVS src (LShiftCntV shift)));
  match(Set dst (LShiftVI src (LShiftCntV shift)));
  match(Set dst (LShiftVL src (LShiftCntV shift)));
  format %{ "vlsl_imm $dst, $src, $shift" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    int con = (int)$shift$$constant;
    if (is_subword_type(bt)) {
      // Optimize for B/S
      int esize_in_bits = type2aelembytes(bt) * BitsPerByte;
      if (con >= esize_in_bits) {
        // Shifting a subword lane left by its full width or more yields
        // zero, so clear the destination with a self-xor and finish early.
        if (VM_Version::use_neon_for_vector(length_in_bytes)) {
          __ eor($dst$$FloatRegister, length_in_bytes == 16 ? __ T16B : __ T8B,
                 $src$$FloatRegister, $src$$FloatRegister);
        } else {
          assert(UseSVE > 0, "must be sve");
          __ sve_eor($dst$$FloatRegister, $src$$FloatRegister, $src$$FloatRegister);
        }
        return;
      }
    }
    if (VM_Version::use_neon_for_vector(length_in_bytes)) {
      __ shl($dst$$FloatRegister, get_arrangement(this), $src$$FloatRegister, con);
    } else {
      assert(UseSVE > 0, "must be sve");
      __ sve_lsl($dst$$FloatRegister, __ elemType_to_regVariant(bt), $src$$FloatRegister, con);
    }
  %}
  ins_pipe(pipe_slow);
%}

// Vector arithmetic shift right by an immediate count.
instruct vasr_imm(vReg dst, vReg src, immI_positive shift) %{
  predicate(assert_not_var_shift(n));
  match(Set dst (RShiftVB src (RShiftCntV shift)));
  match(Set dst (RShiftVS src (RShiftCntV shift)));
  match(Set dst (RShiftVI src (RShiftCntV shift)));
  match(Set dst (RShiftVL src (RShiftCntV shift)));
  format %{ "vasr_imm $dst, $src, $shift" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    int con = (int)$shift$$constant;
    if (is_subword_type(bt)) {
      // Refine con for B/S
      // An arithmetic right shift by the element width or more fills the
      // lane with sign bits, which is what a shift by (width - 1) does.
      int esize_in_bits = type2aelembytes(bt) * BitsPerByte;
      if (con >= esize_in_bits) con = esize_in_bits - 1;
    }
    if (VM_Version::use_neon_for_vector(length_in_bytes)) {
      __ sshr($dst$$FloatRegister, get_arrangement(this), $src$$FloatRegister, con);
    } else {
      assert(UseSVE > 0, "must be sve");
      __ sve_asr($dst$$FloatRegister, __ elemType_to_regVariant(bt), $src$$FloatRegister, con);
    }
  %}
  ins_pipe(pipe_slow);
%}

// Vector logical (unsigned) shift right by an immediate count.
instruct vlsr_imm(vReg dst, vReg src, immI_positive shift) %{
  predicate(assert_not_var_shift(n));
  match(Set dst (URShiftVB src (RShiftCntV shift)));
  match(Set dst (URShiftVS src (RShiftCntV shift)));
  match(Set dst (URShiftVI src (RShiftCntV shift)));
  match(Set dst (URShiftVL src (RShiftCntV shift)));
  format %{ "vlsr_imm $dst, $src, $shift" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    int con = (int)$shift$$constant;
    if (is_subword_type(bt)) {
      // Optimize for B/S
      int esize_in_bits = type2aelembytes(bt) * BitsPerByte;
      if (con >= esize_in_bits) {
        // Shifting a subword lane right by its full width or more yields
        // zero, so clear the destination with a self-xor and finish early.
        if (VM_Version::use_neon_for_vector(length_in_bytes)) {
          __ eor($dst$$FloatRegister, length_in_bytes == 16 ? __ T16B : __ T8B,
                 $src$$FloatRegister, $src$$FloatRegister);
        } else {
          assert(UseSVE > 0, "must be sve");
          __ sve_eor($dst$$FloatRegister, $src$$FloatRegister, $src$$FloatRegister);
        }
        return;
      }
    }
    if (VM_Version::use_neon_for_vector(length_in_bytes)) {
      __ ushr($dst$$FloatRegister, get_arrangement(this), $src$$FloatRegister, con);
    } else {
      assert(UseSVE > 0, "must be sve");
      __ sve_lsr($dst$$FloatRegister, __ elemType_to_regVariant(bt), $src$$FloatRegister, con);
    }
  %}
  ins_pipe(pipe_slow);
%}

// shift right add with imm (vector length <= 128 bits only)

// Signed shift-right-and-accumulate by an immediate (NEON ssra):
// dst += src >> shift. NEON-sized vectors only (see predicate).
instruct vasra_imm(vReg dst, vReg src, immI_positive shift) %{
  predicate(Matcher::vector_length_in_bytes(n) <= 16);
  match(Set dst (AddVB dst (RShiftVB src (RShiftCntV shift))));
  match(Set dst (AddVS dst (RShiftVS src (RShiftCntV shift))));
  match(Set dst (AddVI dst (RShiftVI src (RShiftCntV shift))));
  match(Set dst (AddVL dst (RShiftVL src (RShiftCntV shift))));
  format %{ "vasra_imm $dst, $src, $shift" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    int con = (int)$shift$$constant;
    if (is_subword_type(bt)) {
      // Refine con for B/S
      // An arithmetic shift by the element width or more is equivalent to
      // a shift by (width - 1): the lane fills with sign bits either way.
      int esize_in_bits = type2aelembytes(bt) * BitsPerByte;
      if (con >= esize_in_bits) con = esize_in_bits - 1;
    }
    __ ssra($dst$$FloatRegister, get_arrangement(this), $src$$FloatRegister, con);
  %}
  ins_pipe(pipe_slow);
%}

// Unsigned shift-right-and-accumulate by an immediate (NEON usra):
// dst += src >>> shift. NEON-sized vectors only (see predicate).
instruct vlsra_imm(vReg dst, vReg src, immI_positive shift) %{
  predicate(Matcher::vector_length_in_bytes(n) <= 16);
  match(Set dst (AddVB dst (URShiftVB src (RShiftCntV shift))));
  match(Set dst (AddVS dst (URShiftVS src (RShiftCntV shift))));
  match(Set dst (AddVI dst (URShiftVI src (RShiftCntV shift))));
  match(Set dst (AddVL dst (URShiftVL src (RShiftCntV shift))));
  format %{ "vlsra_imm $dst, $src, $shift" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    int con = (int)$shift$$constant;
    if (is_subword_type(bt)) { // for B/H
      int esize_in_bits = type2aelembytes(bt) * BitsPerByte;
      // A logical shift by the element width or more makes the addend
      // zero, so no instruction is needed: dst already holds the result.
      if (con < esize_in_bits) {
        __ usra($dst$$FloatRegister, get_arrangement(this), $src$$FloatRegister, con);
      }
    } else { // for S/D
      assert(type2aelembytes(bt) == 4 || type2aelembytes(bt) == 8, "unsupported type");
      __ usra($dst$$FloatRegister, get_arrangement(this), $src$$FloatRegister, con);
    }
  %}
  ins_pipe(pipe_slow);
%}

dnl
dnl VSHIFT_PREDICATE($1,   $2,      $3  )
dnl VSHIFT_PREDICATE(type, op_name, insn)
dnl Generates the predicated (masked) SVE shift-by-vector rule.
dnl Destructive: dst_src1 is both first input and result; only lanes
dnl selected by pg are shifted.
define(`VSHIFT_PREDICATE', `
instruct v$1_masked(vReg dst_src1, vReg src2, pRegGov pg) %{
  predicate(UseSVE > 0);
  match(Set dst_src1 ($2VB (Binary dst_src1 src2) pg));
  match(Set dst_src1 ($2VS (Binary dst_src1 src2) pg));
  match(Set dst_src1 ($2VI (Binary dst_src1 src2) pg));
  match(Set dst_src1 ($2VL (Binary dst_src1 src2) pg));
  format %{ "v$1_masked $dst_src1, $pg, $dst_src1, $src2" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ $3($dst_src1$$FloatRegister, __ elemType_to_regVariant(bt),
               $pg$$PRegister, $src2$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl
dnl VSHIFT_IMM_PREDICATE($1,   $2,       $3,       $4,       $5  )
dnl VSHIFT_IMM_PREDICATE(type, arg_type, op_name1, op_name2, insn)
dnl Generates the predicated SVE shift-by-immediate rule. The assert below
dnl allows a zero count for lsl only (arg_type immI); asr and lsr use
dnl immI_positive and require a strictly positive count.
define(`VSHIFT_IMM_PREDICATE', `
instruct v$1_imm_masked(vReg dst_src, $2 shift, pRegGov pg) %{
  predicate(UseSVE > 0);
  match(Set dst_src ($3VB (Binary dst_src ($4 shift)) pg));
  match(Set dst_src ($3VS (Binary dst_src ($4 shift)) pg));
  match(Set dst_src ($3VI (Binary dst_src ($4 shift)) pg));
  match(Set dst_src ($3VL (Binary dst_src ($4 shift)) pg));
  format %{ "v$1_imm_masked $dst_src, $pg, $dst_src, $shift" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    int esize_in_bits = type2aelembytes(bt) * BitsPerByte;
    int con = (int)$shift$$constant;
    assert(con ifelse($1, lsl, >=, >) 0 && con < esize_in_bits, "invalid shift immediate");
    __ $5($dst_src$$FloatRegister, __ elemType_to_regVariant(bt),
               $pg$$PRegister, con);
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl
// vector shift - predicated
VSHIFT_PREDICATE(lsl, LShift,  sve_lsl)
VSHIFT_PREDICATE(asr, RShift,  sve_asr)
VSHIFT_PREDICATE(lsr, URShift, sve_lsr)

// vector shift with imm - predicated
VSHIFT_IMM_PREDICATE(lsl, immI,          LShift,  LShiftCntV, sve_lsl)
VSHIFT_IMM_PREDICATE(asr, immI_positive, RShift,  RShiftCntV, sve_asr)
VSHIFT_IMM_PREDICATE(lsr, immI_positive, URShift, RShiftCntV, sve_lsr)

// ------------------------------ Vector reduction add -------------------------

dnl
dnl REDUCE_ADD_INT_NEON_SVE_PAIRWISE($1,   $2      )
dnl REDUCE_ADD_INT_NEON_SVE_PAIRWISE(type, arg_type)
dnl Generates the NEON and SVE rule pair for integer add reduction:
dnl dst = isrc + (sum of all lanes of vsrc). The SVE variant requires the
dnl vector length to be the full MaxVectorSize and uses a vRegD temp.
define(`REDUCE_ADD_INT_NEON_SVE_PAIRWISE', `
instruct reduce_add$1_neon(iReg$1NoSp dst, $2 isrc, vReg vsrc, vReg tmp) %{
  predicate(VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))));
  match(Set dst (AddReductionV$1 isrc vsrc));
  effect(TEMP_DEF dst, TEMP tmp);
  format %{ "reduce_add$1_neon $dst, $isrc, $vsrc\t# KILL $tmp" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this, $vsrc);
    uint length_in_bytes = Matcher::vector_length_in_bytes(this, $vsrc);
    __ neon_reduce_add_integral($dst$$Register, bt,
                                $isrc$$Register, $vsrc$$FloatRegister,
                                length_in_bytes, $tmp$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

instruct reduce_add$1_sve(iReg$1NoSp dst, $2 isrc, vReg vsrc, vRegD tmp) %{
  predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))));
  match(Set dst (AddReductionV$1 isrc vsrc));
  effect(TEMP_DEF dst, TEMP tmp);
  format %{ "reduce_add$1_sve $dst, $isrc, $vsrc\t# KILL $tmp" %}
  ins_encode %{
    assert(UseSVE > 0, "must be sve");
    BasicType bt = Matcher::vector_element_basic_type(this, $vsrc);
    uint length_in_bytes = Matcher::vector_length_in_bytes(this, $vsrc);
    assert(length_in_bytes == MaxVectorSize, "invalid vector length");
    __ sve_reduce_integral(this->ideal_Opcode(), $dst$$Register, bt,
                           $isrc$$Register, $vsrc$$FloatRegister,
                           ptrue, $tmp$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl
// reduction addI
REDUCE_ADD_INT_NEON_SVE_PAIRWISE(I, iRegIorL2I)

// reduction addL
REDUCE_ADD_INT_NEON_SVE_PAIRWISE(L, iRegL)

// reduction addF

instruct reduce_non_strict_order_add2F_neon(vRegF dst, vRegF fsrc, vReg vsrc) %{
  // Non-strictly ordered floating-point add reduction for a 64-bits-long vector. This rule is
  // intended for the VectorAPI (which allows for non-strictly ordered add reduction).
  predicate(Matcher::vector_length(n->in(2)) == 2 && !n->as_Reduction()->requires_strict_order());
  match(Set dst (AddReductionVF fsrc vsrc));
  effect(TEMP_DEF dst);
  format %{ "reduce_non_strict_order_add2F_neon $dst, $fsrc, $vsrc" %}
  ins_encode %{
    // Pairwise-add the two lanes, then add the scalar input.
    __ faddp($dst$$FloatRegister, $vsrc$$FloatRegister, __ S);
    __ fadds($dst$$FloatRegister, $dst$$FloatRegister, $fsrc$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

instruct reduce_non_strict_order_add4F_neon(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
  // Non-strictly ordered floating-point add reduction for 128-bits-long vector. This rule is
  // intended for the VectorAPI (which allows for non-strictly ordered add reduction).
  predicate(Matcher::vector_length(n->in(2)) == 4 && !n->as_Reduction()->requires_strict_order());
  match(Set dst (AddReductionVF fsrc vsrc));
  effect(TEMP_DEF dst, TEMP tmp);
  format %{ "reduce_non_strict_order_add4F_neon $dst, $fsrc, $vsrc\t# KILL $tmp" %}
  ins_encode %{
    // Two pairwise adds reduce four lanes to one, then add the scalar input.
    __ faddp($tmp$$FloatRegister, __ T4S, $vsrc$$FloatRegister, $vsrc$$FloatRegister);
    __ faddp($dst$$FloatRegister, $tmp$$FloatRegister, __ S);
    __ fadds($dst$$FloatRegister, $dst$$FloatRegister, $fsrc$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}
dnl
dnl REDUCE_ADD_FP_SVE($1,   $2  )
dnl REDUCE_ADD_FP_SVE(type, size)
dnl Generates the SVE rule for floating-point add reduction using
dnl sve_fadda, which accumulates lanes in strict order into dst_src1.
define(`REDUCE_ADD_FP_SVE', `
// This rule calculates the reduction result in strict order. Two cases will
// reach here:
// 1. Non strictly-ordered AddReductionV$1 when vector size > 128-bits. For example -
//    AddReductionV$1 generated by Vector API. For vector size > 128-bits, it is more
//    beneficial performance-wise to generate direct SVE instruction even if it is
//    strictly ordered.
// 2. Strictly-ordered AddReductionV$1. For example - AddReductionV$1 generated by
//    auto-vectorization on SVE machine.
instruct reduce_add$1_sve(vReg$1 dst_src1, vReg src2) %{
  predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))) ||
            n->as_Reduction()->requires_strict_order());
  match(Set dst_src1 (AddReductionV$1 dst_src1 src2));
  format %{ "reduce_add$1_sve $dst_src1, $dst_src1, $src2" %}
  ins_encode %{
    assert(UseSVE > 0, "must be sve");
    uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src2);
    assert(length_in_bytes == MaxVectorSize, "invalid vector length");
    __ sve_fadda($dst_src1$$FloatRegister, __ $2, ptrue, $src2$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl
REDUCE_ADD_FP_SVE(F, S)

// reduction addD

instruct reduce_non_strict_order_add2D_neon(vRegD dst, vRegD dsrc, vReg vsrc) %{
  // Non-strictly ordered floating-point add reduction for doubles. This rule is
  // intended for the VectorAPI (which allows for non-strictly ordered add reduction).
  predicate(!n->as_Reduction()->requires_strict_order());
  match(Set dst (AddReductionVD dsrc vsrc));
  effect(TEMP_DEF dst);
  format %{ "reduce_non_strict_order_add2D_neon $dst, $dsrc, $vsrc\t# 2D" %}
  ins_encode %{
    // Pairwise-add the two double lanes, then add the scalar input.
    __ faddp($dst$$FloatRegister, $vsrc$$FloatRegister, __ D);
    __ faddd($dst$$FloatRegister, $dst$$FloatRegister, $dsrc$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}
REDUCE_ADD_FP_SVE(D, D)

// reduction add - predicated
dnl
dnl REDUCE_ADD_INT_PREDICATE($1,        $2     )
dnl REDUCE_ADD_INT_PREDICATE(insn_name, op_name)
dnl Generates the masked SVE integer add reduction rule: only lanes
dnl selected by pg participate in the reduction.
define(`REDUCE_ADD_INT_PREDICATE', `
instruct reduce_add$1_masked(iReg$1NoSp dst, $2 isrc, vReg vsrc, pRegGov pg, vRegD tmp) %{
  predicate(UseSVE > 0);
  match(Set dst (AddReductionV$1 (Binary isrc vsrc) pg));
  effect(TEMP_DEF dst, TEMP tmp);
  format %{ "reduce_add$1_masked $dst, $isrc, $pg, $vsrc\t# KILL $tmp" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this, $vsrc);
    __ sve_reduce_integral(this->ideal_Opcode(), $dst$$Register, bt,
                           $isrc$$Register, $vsrc$$FloatRegister,
                           $pg$$PRegister, $tmp$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl
dnl REDUCE_ADD_FP_PREDICATE($1,        $2     )
dnl REDUCE_ADD_FP_PREDICATE(insn_name, op_name)
dnl Generates the masked SVE floating-point add reduction rule using
dnl sve_fadda; dst_src1 is both the scalar input and the result.
define(`REDUCE_ADD_FP_PREDICATE', `
instruct reduce_add$1_masked(vReg$1 dst_src1, vReg src2, pRegGov pg) %{
  predicate(UseSVE > 0);
  match(Set dst_src1 (AddReductionV$1 (Binary dst_src1 src2) pg));
  format %{ "reduce_add$1_masked $dst_src1, $pg, $dst_src1, $src2" %}
  ins_encode %{
    __ sve_fadda($dst_src1$$FloatRegister, __ $2,
                 $pg$$PRegister, $src2$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl
REDUCE_ADD_INT_PREDICATE(I, iRegIorL2I)
REDUCE_ADD_INT_PREDICATE(L, iRegL)
REDUCE_ADD_FP_PREDICATE(F, S)
REDUCE_ADD_FP_PREDICATE(D, D)

// ------------------------------ Vector reduction mul -------------------------

// Integer multiply reduction for 64/128-bit vectors (NEON-sized only):
// dst = isrc * (product of all lanes of vsrc). Needs two vector temps.
instruct reduce_mulI(iRegINoSp dst, iRegIorL2I isrc, vReg vsrc,
                     vReg tmp1, vReg tmp2) %{
  predicate(Matcher::vector_length_in_bytes(n->in(2)) == 8 ||
            Matcher::vector_length_in_bytes(n->in(2)) == 16);
  match(Set dst (MulReductionVI isrc vsrc));
  effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2);
  format %{ "reduce_mulI $dst, $isrc, $vsrc\t# vector (64/128 bits). KILL $tmp1, $tmp2" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this, $vsrc);
    uint length_in_bytes = Matcher::vector_length_in_bytes(this, $vsrc);
    __ neon_reduce_mul_integral($dst$$Register, bt, $isrc$$Register,
                                $vsrc$$FloatRegister, length_in_bytes,
                                $tmp1$$FloatRegister, $tmp2$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

// Long multiply reduction for a 128-bit (2L) vector; no vector temps are
// needed, so fnoreg is passed for both temp arguments.
instruct reduce_mulL(iRegLNoSp dst, iRegL isrc, vReg vsrc) %{
  predicate(Matcher::vector_length_in_bytes(n->in(2)) == 16);
  match(Set dst (MulReductionVL isrc vsrc));
  effect(TEMP_DEF dst);
  format %{ "reduce_mulL $dst, $isrc, $vsrc\t# 2L" %}
  ins_encode %{
    __ neon_reduce_mul_integral($dst$$Register, T_LONG, $isrc$$Register,
                                $vsrc$$FloatRegister, 16, fnoreg, fnoreg);
  %}
  ins_pipe(pipe_slow);
%}

// Float multiply reduction for 2F/4F vectors (NEON-sized only).
instruct reduce_mulF(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
  predicate(Matcher::vector_length_in_bytes(n->in(2)) <= 16);
  match(Set dst (MulReductionVF fsrc vsrc));
  effect(TEMP_DEF dst, TEMP tmp);
  format %{ "reduce_mulF $dst, $fsrc, $vsrc\t# 2F/4F. KILL $tmp" %}
  ins_encode %{
    uint length_in_bytes = Matcher::vector_length_in_bytes(this, $vsrc);
    __ neon_reduce_mul_fp($dst$$FloatRegister, T_FLOAT, $fsrc$$FloatRegister,
                          $vsrc$$FloatRegister, length_in_bytes, $tmp$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

// Double multiply reduction for a 128-bit (2D) vector.
instruct reduce_mulD(vRegD dst, vRegD dsrc, vReg vsrc, vReg tmp) %{
  predicate(Matcher::vector_length_in_bytes(n->in(2)) == 16);
  match(Set dst (MulReductionVD dsrc vsrc));
  effect(TEMP_DEF dst, TEMP tmp);
  format %{ "reduce_mulD $dst, $dsrc, $vsrc\t# 2D. KILL $tmp" %}
  ins_encode %{
    __ neon_reduce_mul_fp($dst$$FloatRegister, T_DOUBLE, $dsrc$$FloatRegister,
                          $vsrc$$FloatRegister, 16, $tmp$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

dnl
dnl REDUCE_BITWISE_OP_NEON($1,        $2       $3    $4     )
dnl REDUCE_BITWISE_OP_NEON(insn_name, is_long, type, op_name)
dnl Generates the NEON rule for a bitwise (and/or/xor) reduction. The
dnl ifelse selects the T_LONG case for the L variant and all non-long
dnl element types for the I variant.
define(`REDUCE_BITWISE_OP_NEON', `
instruct reduce_$1$2_neon(iReg$2NoSp dst, $3 isrc, vReg vsrc) %{
  predicate(UseSVE == 0 && Matcher::vector_element_basic_type(n->in(2)) ifelse($2, L, ==, !=) T_LONG);
  match(Set dst ($4 isrc vsrc));
  effect(TEMP_DEF dst);
  format %{ "reduce_$1$2_neon $dst, $isrc, $vsrc" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this, $vsrc);
    uint length_in_bytes = Matcher::vector_length_in_bytes(this, $vsrc);
    __ neon_reduce_logical(this->ideal_Opcode(), $dst$$Register, bt,
                           $isrc$$Register, $vsrc$$FloatRegister, length_in_bytes);
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl
dnl REDUCE_BITWISE_OP_SVE($1,        $2       $3    $4     )
dnl REDUCE_BITWISE_OP_SVE(insn_name, is_long, type, op_name)
dnl Generates the SVE rule for a bitwise (and/or/xor) reduction over the
dnl full MaxVectorSize vector; needs a vRegD temp.
define(`REDUCE_BITWISE_OP_SVE', `
instruct reduce_$1$2_sve(iReg$2NoSp dst, $3 isrc, vReg vsrc, vRegD tmp) %{
  predicate(UseSVE > 0 && Matcher::vector_element_basic_type(n->in(2)) ifelse($2, L, ==, !=) T_LONG);
  match(Set dst ($4 isrc vsrc));
  effect(TEMP_DEF dst, TEMP tmp);
  format %{ "reduce_$1$2_sve $dst, $isrc, $vsrc\t# KILL $tmp" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this, $vsrc);
    uint length_in_bytes = Matcher::vector_length_in_bytes(this, $vsrc);
    assert(length_in_bytes == MaxVectorSize, "invalid vector length");
    __ sve_reduce_integral(this->ideal_Opcode(), $dst$$Register, bt,
                           $isrc$$Register, $vsrc$$FloatRegister,
                           ptrue, $tmp$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl
dnl REDUCE_BITWISE_OP_PREDICATE($1,        $2       $3    $4     )
dnl REDUCE_BITWISE_OP_PREDICATE(insn_name, is_long, type, op_name)
dnl Generates the masked SVE rule for a bitwise (and/or/xor) reduction:
dnl only lanes selected by pg participate.
define(`REDUCE_BITWISE_OP_PREDICATE', `
instruct reduce_$1$2_masked(iReg$2NoSp dst, $3 isrc, vReg vsrc, pRegGov pg, vRegD tmp) %{
  predicate(UseSVE > 0 && Matcher::vector_element_basic_type(n->in(1)->in(2)) ifelse($2, L, ==, !=) T_LONG);
  match(Set dst ($4 (Binary isrc vsrc) pg));
  effect(TEMP_DEF dst, TEMP tmp);
  format %{ "reduce_$1$2_masked $dst, $isrc, $pg, $vsrc\t# KILL $tmp" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this, $vsrc);
    __ sve_reduce_integral(this->ideal_Opcode(), $dst$$Register, bt,
                           $isrc$$Register, $vsrc$$FloatRegister,
                           $pg$$PRegister, $tmp$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl
// Instantiations of the bitwise-reduction macros defined above for
// and/or/xor over int and long elements, plus the predicated variants.
// ------------------------------ Vector reduction and -------------------------

// reduction andI
REDUCE_BITWISE_OP_NEON(and, I, iRegIorL2I, AndReductionV)
REDUCE_BITWISE_OP_SVE(and, I, iRegIorL2I, AndReductionV)

// reduction andL
REDUCE_BITWISE_OP_NEON(and, L, iRegL, AndReductionV)
REDUCE_BITWISE_OP_SVE(and, L, iRegL, AndReductionV)

// reduction and - predicated
REDUCE_BITWISE_OP_PREDICATE(and, I, iRegIorL2I, AndReductionV)
REDUCE_BITWISE_OP_PREDICATE(and, L, iRegL,      AndReductionV)

// ------------------------------ Vector reduction or --------------------------

// reduction orI
REDUCE_BITWISE_OP_NEON(or, I, iRegIorL2I, OrReductionV)
REDUCE_BITWISE_OP_SVE(or, I, iRegIorL2I, OrReductionV)

// reduction orL
REDUCE_BITWISE_OP_NEON(or, L, iRegL, OrReductionV)
REDUCE_BITWISE_OP_SVE(or, L, iRegL, OrReductionV)

// reduction or - predicated
REDUCE_BITWISE_OP_PREDICATE(or, I, iRegIorL2I, OrReductionV)
REDUCE_BITWISE_OP_PREDICATE(or, L, iRegL,      OrReductionV)

// ------------------------------ Vector reduction xor -------------------------

// reduction xorI
REDUCE_BITWISE_OP_NEON(xor, I, iRegIorL2I, XorReductionV)
REDUCE_BITWISE_OP_SVE(xor, I, iRegIorL2I, XorReductionV)

// reduction xorL
REDUCE_BITWISE_OP_NEON(xor, L, iRegL, XorReductionV)
REDUCE_BITWISE_OP_SVE(xor, L, iRegL, XorReductionV)

// reduction xor - predicated
REDUCE_BITWISE_OP_PREDICATE(xor, I, iRegIorL2I, XorReductionV)
REDUCE_BITWISE_OP_PREDICATE(xor, L, iRegL,      XorReductionV)

dnl
dnl REDUCE_MAXMIN_I_NEON($1,   $2     )
dnl REDUCE_MAXMIN_I_NEON(type, op_name)
dnl Generates the NEON max/min reduction rule for B/S/I element types.
define(`REDUCE_MAXMIN_I_NEON', `
instruct reduce_$1I_neon(iRegINoSp dst, iRegIorL2I isrc, vReg vsrc,
                          vReg tmp, rFlagsReg cr) %{
  predicate(VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))) &&
            (Matcher::vector_element_basic_type(n->in(2)) == T_BYTE ||
             Matcher::vector_element_basic_type(n->in(2)) == T_SHORT ||
             Matcher::vector_element_basic_type(n->in(2)) == T_INT));
  match(Set dst ($2 isrc vsrc));
  effect(TEMP_DEF dst, TEMP tmp, KILL cr);
  format %{ "reduce_$1I_neon $dst, $isrc, $vsrc\t# KILL $tmp, cr" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this, $vsrc);
    uint length_in_bytes = Matcher::vector_length_in_bytes(this, $vsrc);
    __ neon_reduce_minmax_integral(this->ideal_Opcode(), $dst$$Register, bt,
                                   $isrc$$Register, $vsrc$$FloatRegister,
                                   length_in_bytes, $tmp$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl
dnl REDUCE_MAXMIN_I_SVE($1,   $2     )
dnl REDUCE_MAXMIN_I_SVE(type, op_name)
dnl Generates the SVE max/min reduction rule for B/S/I element types over
dnl the full MaxVectorSize vector.
define(`REDUCE_MAXMIN_I_SVE', `
instruct reduce_$1I_sve(iRegINoSp dst, iRegIorL2I isrc, vReg vsrc,
                         vRegD tmp, rFlagsReg cr) %{
  predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))) &&
            (Matcher::vector_element_basic_type(n->in(2)) == T_BYTE ||
             Matcher::vector_element_basic_type(n->in(2)) == T_SHORT ||
             Matcher::vector_element_basic_type(n->in(2)) == T_INT));
  match(Set dst ($2 isrc vsrc));
  effect(TEMP_DEF dst, TEMP tmp, KILL cr);
  format %{ "reduce_$1I_sve $dst, $isrc, $vsrc\t# KILL $tmp, cr" %}
  ins_encode %{
    assert(UseSVE > 0, "must be sve");
    BasicType bt = Matcher::vector_element_basic_type(this, $vsrc);
    uint length_in_bytes = Matcher::vector_length_in_bytes(this, $vsrc);
    assert(length_in_bytes == MaxVectorSize, "invalid vector length");
    __ sve_reduce_integral(this->ideal_Opcode(), $dst$$Register, bt,
                           $isrc$$Register, $vsrc$$FloatRegister,
                           ptrue, $tmp$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl
dnl REDUCE_MAXMIN_L_NEON($1,   $2     )
dnl REDUCE_MAXMIN_L_NEON(type, op_name)
dnl Generates the NEON max/min reduction rule for the 2L case; no vector
dnl temp is needed, so fnoreg is passed.
define(`REDUCE_MAXMIN_L_NEON', `
instruct reduce_$1L_neon(iRegLNoSp dst, iRegL isrc, vReg vsrc, rFlagsReg cr) %{
  predicate(UseSVE == 0 && Matcher::vector_element_basic_type(n->in(2)) == T_LONG);
  match(Set dst ($2 isrc vsrc));
  effect(TEMP_DEF dst, KILL cr);
  format %{ "reduce_$1L_neon $dst, $isrc, $vsrc\t# 2L. KILL cr" %}
  ins_encode %{
    __ neon_reduce_minmax_integral(this->ideal_Opcode(), $dst$$Register, T_LONG,
                                   $isrc$$Register, $vsrc$$FloatRegister,
                                   /* vector_length_in_bytes */ 16, fnoreg);
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl
dnl REDUCE_MAXMIN_L_SVE($1,   $2     )
dnl REDUCE_MAXMIN_L_SVE(type, op_name)
dnl Generates the SVE max/min reduction rule for long elements over the
dnl full MaxVectorSize vector.
define(`REDUCE_MAXMIN_L_SVE', `
instruct reduce_$1L_sve(iRegLNoSp dst, iRegL isrc, vReg vsrc,
                         vRegD tmp, rFlagsReg cr) %{
  predicate(UseSVE > 0 && Matcher::vector_element_basic_type(n->in(2)) == T_LONG);
  match(Set dst ($2 isrc vsrc));
  effect(TEMP_DEF dst, TEMP tmp, KILL cr);
  format %{ "reduce_$1L_sve $dst, $isrc, $vsrc\t# KILL $tmp, cr" %}
  ins_encode %{
    uint length_in_bytes = Matcher::vector_length_in_bytes(this, $vsrc);
    assert(length_in_bytes == MaxVectorSize, "invalid vector length");
    __ sve_reduce_integral(this->ideal_Opcode(), $dst$$Register, T_LONG,
                           $isrc$$Register, $vsrc$$FloatRegister,
                           ptrue, $tmp$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl
dnl REDUCE_MAXMIN_F($1,   $2,      $3,    $4,    $5,    $6   )
dnl REDUCE_MAXMIN_F(type, op_name, insn1, insn2, insn3, insn4)
dnl Generates the float max/min reduction rule. insn1 is the NEON pairwise
dnl form for 64-bit vectors, insn2 the NEON across-vector form for 128-bit
dnl vectors, insn3 the SVE across-vector form, and insn4 the scalar op
dnl that combines the lane result with the scalar input fsrc.
define(`REDUCE_MAXMIN_F', `
instruct reduce_$1F(vRegF dst, vRegF fsrc, vReg vsrc) %{
  predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT);
  match(Set dst ($2 fsrc vsrc));
  effect(TEMP_DEF dst);
  format %{ "reduce_$1F $dst, $fsrc, $vsrc" %}
  ins_encode %{
    uint length_in_bytes = Matcher::vector_length_in_bytes(this, $vsrc);
    if (VM_Version::use_neon_for_vector(length_in_bytes)) {
      if (length_in_bytes == 8) {
        __ $3($dst$$FloatRegister, $vsrc$$FloatRegister, __ S);
      } else {
        __ $4($dst$$FloatRegister, __ T4S, $vsrc$$FloatRegister);
      }
    } else {
      assert(UseSVE > 0, "must be sve");
      assert(length_in_bytes == MaxVectorSize, "invalid vector length");
      __ $5($dst$$FloatRegister, __ S, ptrue, $vsrc$$FloatRegister);
    }
    __ $6($dst$$FloatRegister, $dst$$FloatRegister, $fsrc$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl
dnl REDUCE_MAXMIN_D($1,   $2,      $3,    $4,    $5   )
dnl REDUCE_MAXMIN_D(type, op_name, insn1, insn2, insn3)
dnl Generates the double max/min reduction rule. insn1 is the NEON
dnl pairwise form, insn2 the SVE across-vector form, and insn3 the scalar
dnl op that combines the lane result with the scalar input dsrc.
define(`REDUCE_MAXMIN_D', `
instruct reduce_$1D(vRegD dst, vRegD dsrc, vReg vsrc) %{
  predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE);
  match(Set dst ($2 dsrc vsrc));
  effect(TEMP_DEF dst);
  format %{ "reduce_$1D $dst, $dsrc, $vsrc" %}
  ins_encode %{
    uint length_in_bytes = Matcher::vector_length_in_bytes(this, $vsrc);
    if (VM_Version::use_neon_for_vector(length_in_bytes)) {
      __ $3($dst$$FloatRegister, $vsrc$$FloatRegister, __ D);
    } else {
      assert(UseSVE > 0, "must be sve");
      assert(length_in_bytes == MaxVectorSize, "invalid vector length");
      __ $4($dst$$FloatRegister, __ D, ptrue, $vsrc$$FloatRegister);
    }
    __ $5($dst$$FloatRegister, $dst$$FloatRegister, $dsrc$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl
dnl REDUCE_MAXMIN_INT_PREDICATE($1,   $2,      $3,       $4     )
dnl REDUCE_MAXMIN_INT_PREDICATE(type, is_long, arg_type, op_name)
dnl Generates the masked SVE integer max/min reduction rule; the ifelse
dnl picks the B/S/I predicate for the I variant and T_LONG for L.
define(`REDUCE_MAXMIN_INT_PREDICATE', `
instruct reduce_$1$2_masked(iReg$2NoSp dst, $3 isrc, vReg vsrc, pRegGov pg,
                            vRegD tmp, rFlagsReg cr) %{
  ifelse($2, I,
       `predicate(UseSVE > 0 &&
            (Matcher::vector_element_basic_type(n->in(1)->in(2)) == T_BYTE ||
             Matcher::vector_element_basic_type(n->in(1)->in(2)) == T_SHORT ||
             Matcher::vector_element_basic_type(n->in(1)->in(2)) == T_INT));',
       `predicate(UseSVE > 0 && Matcher::vector_element_basic_type(n->in(1)->in(2)) == T_LONG);')
  match(Set dst ($4 (Binary isrc vsrc) pg));
  effect(TEMP_DEF dst, TEMP tmp, KILL cr);
  format %{ "reduce_$1$2_masked $dst, $isrc, $pg, $vsrc\t# KILL $tmp, cr" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this, $vsrc);
    __ sve_reduce_integral(this->ideal_Opcode(), $dst$$Register, bt,
                           $isrc$$Register, $vsrc$$FloatRegister,
                           $pg$$PRegister, $tmp$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl
dnl REDUCE_MAXMIN_FP_PREDICATE($1,   $2,       $3,       $4,      $5,    $6   )
dnl REDUCE_MAXMIN_FP_PREDICATE(type, is_float, arg_name, op_name, insn1, insn2)
dnl Generates the masked SVE floating-point max/min reduction rule: insn1
dnl reduces the lanes selected by pg, insn2 combines the result with the
dnl scalar input.
define(`REDUCE_MAXMIN_FP_PREDICATE', `
instruct reduce_$1$2_masked(vReg$2 dst, vReg$2 $3, vReg vsrc, pRegGov pg) %{
  predicate(UseSVE > 0 && Matcher::vector_element_basic_type(n->in(1)->in(2)) == ifelse($2, F, T_FLOAT, T_DOUBLE));
  match(Set dst ($4 (Binary $3 vsrc) pg));
  effect(TEMP_DEF dst);
  format %{ "reduce_$1$2_masked $dst, $$3, $pg, $vsrc" %}
  ins_encode %{
    __ $5($dst$$FloatRegister, __ ifelse($2, F, S, D), $pg$$PRegister, $vsrc$$FloatRegister);
    __ $6($dst$$FloatRegister, $dst$$FloatRegister, $$3$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl
dnl Instantiations of the max/min reduction macros defined above.
// ------------------------------ Vector reduction max -------------------------

// reduction maxI
REDUCE_MAXMIN_I_NEON(max, MaxReductionV)
REDUCE_MAXMIN_I_SVE(max, MaxReductionV)

// reduction maxL
REDUCE_MAXMIN_L_NEON(max, MaxReductionV)
REDUCE_MAXMIN_L_SVE(max, MaxReductionV)

// reduction maxF
REDUCE_MAXMIN_F(max, MaxReductionV, fmaxp, fmaxv, sve_fmaxv, fmaxs)

// reduction maxD
REDUCE_MAXMIN_D(max, MaxReductionV, fmaxp, sve_fmaxv, fmaxd)

// reduction max - predicated
REDUCE_MAXMIN_INT_PREDICATE(max, I, iRegIorL2I, MaxReductionV)
REDUCE_MAXMIN_INT_PREDICATE(max, L, iRegL,      MaxReductionV)
REDUCE_MAXMIN_FP_PREDICATE(max, F, fsrc, MaxReductionV, sve_fmaxv, fmaxs)
REDUCE_MAXMIN_FP_PREDICATE(max, D, dsrc, MaxReductionV, sve_fmaxv, fmaxd)

// ------------------------------ Vector reduction min -------------------------

// reduction minI
REDUCE_MAXMIN_I_NEON(min, MinReductionV)
REDUCE_MAXMIN_I_SVE(min, MinReductionV)

// reduction minL
REDUCE_MAXMIN_L_NEON(min, MinReductionV)
REDUCE_MAXMIN_L_SVE(min, MinReductionV)

// reduction minF
REDUCE_MAXMIN_F(min, MinReductionV, fminp, fminv, sve_fminv, fmins)

// reduction minD
REDUCE_MAXMIN_D(min, MinReductionV, fminp, sve_fminv, fmind)

// reduction min - predicated
REDUCE_MAXMIN_INT_PREDICATE(min, I, iRegIorL2I, MinReductionV)
REDUCE_MAXMIN_INT_PREDICATE(min, L, iRegL,      MinReductionV)
REDUCE_MAXMIN_FP_PREDICATE(min, F, fsrc, MinReductionV, sve_fminv, fmins)
REDUCE_MAXMIN_FP_PREDICATE(min, D, dsrc, MinReductionV, sve_fminv, fmind)

// ------------------------------ Vector reinterpret ---------------------------

// Reinterpret with unchanged length in bytes: the bit pattern is already
// correct, so no instruction is emitted. dst and src share one register
// (dst_src), which is why this rule can have an empty encoding and zero cost.
instruct reinterpret_same_size(vReg dst_src) %{
  predicate(Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1)));
  match(Set dst_src (VectorReinterpret dst_src));
  ins_cost(0);
  format %{ "reinterpret_same_size $dst_src\t# do nothing" %}
  ins_encode(/* empty encoding */);
  ins_pipe(pipe_class_empty);
%}

// Reinterpret between two different vector lengths when both source and
// destination fit in 128 bits. Only the low min(src, dst) bytes carry data;
// bits of dst above its own length must read as zero afterwards.
instruct reinterpret_resize_le128b(vReg dst, vReg src) %{
  predicate(Matcher::vector_length_in_bytes(n) != Matcher::vector_length_in_bytes(n->in(1)) &&
            Matcher::vector_length_in_bytes(n) <= 16 &&
            Matcher::vector_length_in_bytes(n->in(1)) <= 16);
  match(Set dst (VectorReinterpret src));
  format %{ "reinterpret_resize_le128b $dst, $src\t# vector <= 128 bits." %}
  ins_encode %{
    uint length_in_bytes_src = Matcher::vector_length_in_bytes(this, $src);
    uint length_in_bytes_dst = Matcher::vector_length_in_bytes(this);
    // The higher bits in "dst" register must be cleared to zero.
    if ((length_in_bytes_src == 4 && length_in_bytes_dst == 8) ||
        (length_in_bytes_src == 8 && length_in_bytes_dst == 4)) {
      // Reinterpret between 32 bits and 64 bits
      // NOTE(review): relies on this dup form copying lane 0 and zeroing the
      // rest of the register - confirm against the assembler dup overloads.
      __ dup($dst$$FloatRegister, __ S, $src$$FloatRegister);
    } else if ((length_in_bytes_src == 4 && length_in_bytes_dst == 16) ||
               (length_in_bytes_src == 16 && length_in_bytes_dst == 4)) {
      // Reinterpret between 32 bits and 128 bits
      __ dup($dst$$FloatRegister, __ S, $src$$FloatRegister);
    } else if ((length_in_bytes_src == 8 && length_in_bytes_dst == 16) ||
               (length_in_bytes_src == 16 && length_in_bytes_dst == 8)) {
      // Reinterpret between 64 bits and 128 bits
      // A 64-bit (T8B) NEON op writes the low half and zeroes the upper half.
      __ orr($dst$$FloatRegister, __ T8B, $src$$FloatRegister, $src$$FloatRegister);
    } else {
      assert(false, "invalid vector length");
      ShouldNotReachHere();
    }
  %}
  ins_pipe(pipe_slow);
%}

// Reinterpret between two different vector lengths when at least one side is
// longer than 128 bits (SVE only). Copies the low min(src, dst) bytes of src
// into dst and zeroes everything above them.
instruct reinterpret_resize_gt128b(vReg dst, vReg src, pReg ptmp, rFlagsReg cr) %{
  predicate(Matcher::vector_length_in_bytes(n) != Matcher::vector_length_in_bytes(n->in(1)) &&
            (Matcher::vector_length_in_bytes(n) > 16 ||
             Matcher::vector_length_in_bytes(n->in(1)) > 16));
  match(Set dst (VectorReinterpret src));
  effect(TEMP_DEF dst, TEMP ptmp, KILL cr);
  format %{ "reinterpret_resize_gt128b $dst, $src\t# vector > 128 bits. KILL $ptmp, cr" %}
  ins_encode %{
    assert(UseSVE > 0, "must be sve");
    uint length_in_bytes_src = Matcher::vector_length_in_bytes(this, $src);
    uint length_in_bytes_dst = Matcher::vector_length_in_bytes(this);
    // The number of bytes that actually carry data across the reinterpret.
    uint length_in_bytes_resize = length_in_bytes_src < length_in_bytes_dst ?
                                  length_in_bytes_src : length_in_bytes_dst;
    assert(length_in_bytes_src <= MaxVectorSize && length_in_bytes_dst <= MaxVectorSize,
           "invalid vector length");
    // Predicate covering exactly the low length_in_bytes_resize byte lanes.
    __ sve_gen_mask_imm($ptmp$$PRegister, T_BYTE, length_in_bytes_resize);
    // Zero dst, then select src bytes where the predicate is active.
    __ sve_dup($dst$$FloatRegister, __ B, 0);
    __ sve_sel($dst$$FloatRegister, __ B, $ptmp$$PRegister,
               $src$$FloatRegister, $dst$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

// ---------------------------- Vector zero extend --------------------------------
dnl VECTOR_ZERO_EXTEND($1,      $2,     $3,       $4,        $5,         )
dnl VECTOR_ZERO_EXTEND(op_name, src_bt, src_size, assertion, neon_comment)
dnl
dnl Emits the rule for a zero-extending (unsigned) vector cast from the
dnl source element type (arg 2) to a wider integral type. Arg 4 is an
dnl assertion restricting the admissible destination element types.
define(`VECTOR_ZERO_EXTEND', `
// Zero-extend each lane of src to the wider destination element type.
instruct vzeroExt$1toX(vReg dst, vReg src) %{
  match(Set dst (VectorUCast`$1'2X src));
  format %{ "vzeroExt$1toX $dst, $src" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    assert($4, "must be");
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    if (VM_Version::use_neon_for_vector(length_in_bytes)) {
      // $5
      __ neon_vector_extend($dst$$FloatRegister, bt, length_in_bytes,
                            $src$$FloatRegister, $2, /* is_unsigned */ true);
    } else {
      assert(UseSVE > 0, "must be sve");
      __ sve_vector_extend($dst$$FloatRegister, __ elemType_to_regVariant(bt),
                           $src$$FloatRegister, __ $3, /* is_unsigned */ true);
    }
  %}
  ins_pipe(pipe_slow);
%}')dnl
VECTOR_ZERO_EXTEND(B, T_BYTE,  B, bt == T_SHORT || bt == T_INT || bt == T_LONG, `4B to 4S/4I, 8B to 8S')
VECTOR_ZERO_EXTEND(S, T_SHORT, H, bt == T_INT || bt == T_LONG,                  `2S to 2I/2L, 4S to 4I')
VECTOR_ZERO_EXTEND(I, T_INT,   S, bt == T_LONG,                                 `2I to 2L')

// ------------------------------ Vector cast ----------------------------------

// VectorCastB2X

// Cast a byte vector to a wider element type. For an integral destination
// this is a sign extend; for a float destination the bytes are first sign
// extended to int and then converted with a signed int-to-FP convert.
instruct vcvtBtoX(vReg dst, vReg src) %{
  match(Set dst (VectorCastB2X src));
  format %{ "vcvtBtoX $dst, $src" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    if (VM_Version::use_neon_for_vector(length_in_bytes)) {
      // 4B to 4S/4I/4F, 8B to 8S
      __ neon_vector_extend($dst$$FloatRegister, bt == T_FLOAT ? T_INT : bt,
                            length_in_bytes, $src$$FloatRegister, T_BYTE);
      if (bt == T_FLOAT) {
        // Only the 4B to 4F shape reaches here on the NEON path.
        __ scvtfv(__ T4S, $dst$$FloatRegister, $dst$$FloatRegister);
      }
    } else {
      assert(UseSVE > 0, "must be sve");
      Assembler::SIMD_RegVariant size = __ elemType_to_regVariant(bt);
      __ sve_vector_extend($dst$$FloatRegister, size, $src$$FloatRegister, __ B);
      if (is_floating_point_type(bt)) {
        // In-register convert at the destination lane size.
        __ sve_scvtf($dst$$FloatRegister, size, ptrue, $dst$$FloatRegister, size);
      }
    }
  %}
  ins_pipe(pipe_slow);
%}

// VectorCastS2X

// Narrowing cast short to byte, NEON flavor (source vector fits in 128 bits).
instruct vcvtStoB_neon(vReg dst, vReg src) %{
  predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
            VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(1))));
  match(Set dst (VectorCastS2X src));
  format %{ "vcvtStoB_neon $dst, $src" %}
  ins_encode %{
    // 4S/8S to 4B/8B
    uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src);
    __ neon_vector_narrow($dst$$FloatRegister, T_BYTE,
                          $src$$FloatRegister, T_SHORT, length_in_bytes);
  %}
  ins_pipe(pipe_slow);
%}

// Narrowing cast short to byte, SVE flavor (source vector over 128 bits).
// Needs a scratch vector register for the narrowing helper.
instruct vcvtStoB_sve(vReg dst, vReg src, vReg tmp) %{
  predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
            !VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(1))));
  match(Set dst (VectorCastS2X src));
  effect(TEMP tmp);
  format %{ "vcvtStoB_sve $dst, $src\t# KILL $tmp" %}
  ins_encode %{
    assert(UseSVE > 0, "must be sve");
    __ sve_vector_narrow($dst$$FloatRegister, __ B,
                         $src$$FloatRegister, __ H, $tmp$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

// Widening cast short to int/long/float/double (destination lane >= 4 bytes).
// Integral destinations are a sign extend; FP destinations sign extend to the
// matching integral width first and then convert.
instruct vcvtStoX_extend(vReg dst, vReg src) %{
  predicate(type2aelembytes(Matcher::vector_element_basic_type(n)) >= 4);
  match(Set dst (VectorCastS2X src));
  format %{ "vcvtStoX_extend $dst, $src" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    if (VM_Version::use_neon_for_vector(length_in_bytes)) {
      if (is_floating_point_type(bt)) {
        // 2S to 2F/2D, 4S to 4F
        __ neon_vector_extend($dst$$FloatRegister, bt == T_FLOAT ? T_INT : T_LONG,
                              length_in_bytes, $src$$FloatRegister, T_SHORT);
        __ scvtfv(get_arrangement(this), $dst$$FloatRegister, $dst$$FloatRegister);
      } else {
        // 2S to 2I/2L, 4S to 4I
        __ neon_vector_extend($dst$$FloatRegister, bt, length_in_bytes,
                              $src$$FloatRegister, T_SHORT);
      }
    } else {
      assert(UseSVE > 0, "must be sve");
      Assembler::SIMD_RegVariant size = __ elemType_to_regVariant(bt);
      __ sve_vector_extend($dst$$FloatRegister, size, $src$$FloatRegister, __ H);
      if (is_floating_point_type(bt)) {
        __ sve_scvtf($dst$$FloatRegister, size, ptrue, $dst$$FloatRegister, size);
      }
    }
  %}
  ins_pipe(pipe_slow);
%}

// VectorCastI2X

// Narrowing cast int to byte/short, NEON flavor (source fits in 128 bits).
instruct vcvtItoX_narrow_neon(vReg dst, vReg src) %{
  predicate((Matcher::vector_element_basic_type(n) == T_BYTE ||
             Matcher::vector_element_basic_type(n) == T_SHORT) &&
            VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(1))));
  match(Set dst (VectorCastI2X src));
  effect(TEMP_DEF dst);
  format %{ "vcvtItoX_narrow_neon $dst, $src" %}
  ins_encode %{
    // 2I to 2S, 4I to 4B/4S
    BasicType bt = Matcher::vector_element_basic_type(this);
    uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src);
    __ neon_vector_narrow($dst$$FloatRegister, bt,
                          $src$$FloatRegister, T_INT, length_in_bytes);
  %}
  ins_pipe(pipe_slow);
%}

// Narrowing cast int to byte/short, SVE flavor (source over 128 bits).
instruct vcvtItoX_narrow_sve(vReg dst, vReg src, vReg tmp) %{
  predicate((Matcher::vector_element_basic_type(n) == T_BYTE ||
             Matcher::vector_element_basic_type(n) == T_SHORT) &&
            !VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(1))));
  match(Set dst (VectorCastI2X src));
  effect(TEMP_DEF dst, TEMP tmp);
  format %{ "vcvtItoX_narrow_sve $dst, $src\t# KILL $tmp" %}
  ins_encode %{
    assert(UseSVE > 0, "must be sve");
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ sve_vector_narrow($dst$$FloatRegister, __ elemType_to_regVariant(bt),
                         $src$$FloatRegister, __ S, $tmp$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

// Same-width or widening cast int to float/long/double (lane >= 4 bytes).
instruct vcvtItoX(vReg dst, vReg src) %{
  predicate(type2aelembytes(Matcher::vector_element_basic_type(n)) >= 4);
  match(Set dst (VectorCastI2X src));
  format %{ "vcvtItoX $dst, $src" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    uint length_in_bytes =  Matcher::vector_length_in_bytes(this);
    if (bt == T_FLOAT) {
      // Same lane width: a direct signed int-to-float convert suffices.
      if (VM_Version::use_neon_for_vector(length_in_bytes)) {
        // 2I/4I to 2F/4F
        __ scvtfv(get_arrangement(this), $dst$$FloatRegister, $src$$FloatRegister);
      } else {
        assert(UseSVE > 0, "must be sve");
        __ sve_scvtf($dst$$FloatRegister, __ S, ptrue, $src$$FloatRegister, __ S);
      }
    } else {
      // Widening to an 8-byte lane: sign extend to long first, then convert
      // in place when the destination is double.
      assert(type2aelembytes(bt) == 8, "unsupported type");
      if (VM_Version::use_neon_for_vector(length_in_bytes)) {
        // 2I to 2L/2D
        __ neon_vector_extend($dst$$FloatRegister, T_LONG, length_in_bytes,
                              $src$$FloatRegister, T_INT);
        if (bt == T_DOUBLE) {
          __ scvtfv(__ T2D, $dst$$FloatRegister, $dst$$FloatRegister);
        }
      } else {
        assert(UseSVE > 0, "must be sve");
        __ sve_vector_extend($dst$$FloatRegister, __ D, $src$$FloatRegister, __ S);
        if (bt == T_DOUBLE) {
          __ sve_scvtf($dst$$FloatRegister, __ D, ptrue, $dst$$FloatRegister, __ D);
        }
      }
    }
  %}
  ins_pipe(pipe_slow);
%}

// VectorCastL2X

// Narrowing cast long to short/int, NEON flavor (source fits in 128 bits).
instruct vcvtLtoX_narrow_neon(vReg dst, vReg src) %{
  predicate((Matcher::vector_element_basic_type(n) == T_INT ||
             Matcher::vector_element_basic_type(n) == T_SHORT) &&
            VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(1))));
  match(Set dst (VectorCastL2X src));
  format %{ "vcvtLtoX_narrow_neon $dst, $src" %}
  ins_encode %{
    // 2L to 2S/2I
    BasicType bt = Matcher::vector_element_basic_type(this);
    uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src);
    __ neon_vector_narrow($dst$$FloatRegister, bt,
                          $src$$FloatRegister, T_LONG, length_in_bytes);
  %}
  ins_pipe(pipe_slow);
%}

// Narrowing cast long to any integral type of at most 4 bytes, SVE flavor.
instruct vcvtLtoX_narrow_sve(vReg dst, vReg src, vReg tmp) %{
  predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(1))) &&
            !is_floating_point_type(Matcher::vector_element_basic_type(n)) &&
            type2aelembytes(Matcher::vector_element_basic_type(n)) <= 4);
  match(Set dst (VectorCastL2X src));
  effect(TEMP_DEF dst, TEMP tmp);
  format %{ "vcvtLtoX_narrow_sve $dst, $src\t# KILL $tmp" %}
  ins_encode %{
    assert(UseSVE > 0, "must be sve");
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ sve_vector_narrow($dst$$FloatRegister, __ elemType_to_regVariant(bt),
                         $src$$FloatRegister, __ D, $tmp$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

// Cast long to float without SVE: each long lane is moved to a general
// register and converted with the scalar long-to-float instruction, then the
// two float results are assembled into dst.
instruct vcvtLtoF_neon(vReg dst, vReg src, vRegF tmp) %{
  predicate(UseSVE == 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
  match(Set dst (VectorCastL2X src));
  effect(TEMP_DEF dst, TEMP tmp);
  format %{ "vcvtLtoF_neon $dst, $src\t# 2L to 2F. KILL $tmp" %}
  ins_encode %{
    // 2L to 2F
    __ umov(rscratch1, $src$$FloatRegister, __ D, 0);
    __ scvtfs($dst$$FloatRegister, rscratch1);
    __ umov(rscratch1, $src$$FloatRegister, __ D, 1);
    __ scvtfs($tmp$$FloatRegister, rscratch1);
    // Insert the second result into lane 1 of dst.
    __ ins($dst$$FloatRegister, __ S, $tmp$$FloatRegister, 1, 0);
  %}
  ins_pipe(pipe_slow);
%}

// Cast long to float with SVE: convert in the wide lanes, then compact the
// float results from D-lane spacing down to S-lane spacing.
instruct vcvtLtoF_sve(vReg dst, vReg src, vReg tmp) %{
  predicate(UseSVE > 0 && Matcher::vector_element_basic_type(n) == T_FLOAT);
  match(Set dst (VectorCastL2X src));
  effect(TEMP_DEF dst, TEMP tmp);
  format %{ "vcvtLtoF_sve $dst, $src\t# KILL $tmp" %}
  ins_encode %{
    __ sve_scvtf($dst$$FloatRegister, __ S, ptrue, $src$$FloatRegister, __ D);
    __ sve_vector_narrow($dst$$FloatRegister, __ S,
                         $dst$$FloatRegister, __ D, $tmp$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

// Cast long to double: same lane width, so a single signed convert suffices
// on both the NEON and the SVE path.
instruct vcvtLtoD(vReg dst, vReg src) %{
  predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
  match(Set dst (VectorCastL2X src));
  format %{ "vcvtLtoD $dst, $src" %}
  ins_encode %{
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    if (VM_Version::use_neon_for_vector(length_in_bytes)) {
      // 2L to 2D
      __ scvtfv(__ T2D, $dst$$FloatRegister, $src$$FloatRegister);
    } else {
      assert(UseSVE > 0, "must be sve");
      __ sve_scvtf($dst$$FloatRegister, __ D, ptrue, $src$$FloatRegister, __ D);
    }
  %}
  ins_pipe(pipe_slow);
%}

// VectorCastF2X

// Cast float to byte/short, NEON flavor: convert float to int in place,
// then narrow the int lanes to the destination size.
instruct vcvtFtoX_narrow_neon(vReg dst, vReg src) %{
  predicate(VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(1))) &&
            (Matcher::vector_element_basic_type(n) == T_BYTE ||
             Matcher::vector_element_basic_type(n) == T_SHORT));
  match(Set dst (VectorCastF2X src));
  effect(TEMP_DEF dst);
  format %{ "vcvtFtoX_narrow_neon $dst, $src" %}
  ins_encode %{
    // 2F to 2S, 4F to 4B/4S
    BasicType bt = Matcher::vector_element_basic_type(this);
    uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src);
    __ fcvtzs($dst$$FloatRegister, length_in_bytes == 16 ? __ T4S : __ T2S,
              $src$$FloatRegister);
    __ neon_vector_narrow($dst$$FloatRegister, bt,
                          $dst$$FloatRegister, T_INT, length_in_bytes);
  %}
  ins_pipe(pipe_slow);
%}

// Cast float to byte/short, SVE flavor: convert to int at S-lane size, then
// narrow to the destination lane size using a scratch register.
instruct vcvtFtoX_narrow_sve(vReg dst, vReg src, vReg tmp) %{
  predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(1))) &&
            (Matcher::vector_element_basic_type(n) == T_BYTE ||
             Matcher::vector_element_basic_type(n) == T_SHORT));
  match(Set dst (VectorCastF2X src));
  effect(TEMP_DEF dst, TEMP tmp);
  format %{ "vcvtFtoX_narrow_sve $dst, $src\t# KILL $tmp" %}
  ins_encode %{
    assert(UseSVE > 0, "must be sve");
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ sve_fcvtzs($dst$$FloatRegister, __ S, ptrue, $src$$FloatRegister, __ S);
    __ sve_vector_narrow($dst$$FloatRegister, __ elemType_to_regVariant(bt),
                         $dst$$FloatRegister, __ S, $tmp$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

// Cast float to int/long/double (destination lane >= 4 bytes).
instruct vcvtFtoX(vReg dst, vReg src) %{
  predicate(type2aelembytes(Matcher::vector_element_basic_type(n)) >= 4);
  match(Set dst (VectorCastF2X src));
  format %{ "vcvtFtoX $dst, $src" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    uint length_in_bytes =  Matcher::vector_length_in_bytes(this);
    if (bt == T_INT) {
      // Same lane width: direct float-to-int convert.
      if (VM_Version::use_neon_for_vector(length_in_bytes)) {
        // 2F/4F to 2I/4I
        __ fcvtzs($dst$$FloatRegister, get_arrangement(this), $src$$FloatRegister);
      } else {
        assert(UseSVE > 0, "must be sve");
        __ sve_fcvtzs($dst$$FloatRegister, __ S, ptrue, $src$$FloatRegister, __ S);
      }
    } else if (bt == T_LONG) {
      if (UseSVE == 0) {
        // 2F to 2L: widen the floats to double, then convert to long.
        __ fcvtl($dst$$FloatRegister, __ T2D, $src$$FloatRegister, __ T2S);
        __ fcvtzs($dst$$FloatRegister, __ T2D, $dst$$FloatRegister);
      } else {
        // Unpack the S lanes into D lanes, then convert the float held in
        // each D lane directly to a long.
        __ sve_vector_extend($dst$$FloatRegister, __ D, $src$$FloatRegister, __ S);
        __ sve_fcvtzs($dst$$FloatRegister, __ D, ptrue, $dst$$FloatRegister, __ S);
      }
    } else {
      assert(bt == T_DOUBLE, "unsupported type");
      if (length_in_bytes == 16) {
        // 2F to 2D
        __ fcvtl($dst$$FloatRegister, __ T2D, $src$$FloatRegister, __ T2S);
      } else {
        assert(UseSVE > 0 && length_in_bytes > 16, "must be");
        __ sve_vector_extend($dst$$FloatRegister, __ D, $src$$FloatRegister, __ S);
        __ sve_fcvt($dst$$FloatRegister, __ D, ptrue, $dst$$FloatRegister, __ S);
      }
    }
  %}
  ins_pipe(pipe_slow);
%}

// VectorCastD2X

// Cast double to short/int without SVE: each double lane is converted with
// the scalar saturating convert, then the two int results are reassembled.
instruct vcvtDtoI_neon(vReg dst, vReg src) %{
  predicate(UseSVE == 0 &&
            (Matcher::vector_element_basic_type(n) == T_INT ||
             Matcher::vector_element_basic_type(n) == T_SHORT));
  match(Set dst (VectorCastD2X src));
  effect(TEMP_DEF dst);
  format %{ "vcvtDtoI_neon $dst, $src\t# 2D to 2S/2I" %}
  ins_encode %{
    // 2D to 2S/2I
    // Stash lane 1 of src in dst so both lanes can be read after dst is
    // overwritten below.
    __ ins($dst$$FloatRegister, __ D, $src$$FloatRegister, 0, 1);
    // We can't use fcvtzs(vector, integer) instruction here because we need
    // saturation arithmetic. See JDK-8276151.
    __ fcvtzdw(rscratch1, $src$$FloatRegister);
    __ fcvtzdw(rscratch2, $dst$$FloatRegister);
    __ fmovs($dst$$FloatRegister, rscratch1);
    __ mov($dst$$FloatRegister, __ S, 1, rscratch2);
    if (Matcher::vector_element_basic_type(this) == T_SHORT) {
      __ neon_vector_narrow($dst$$FloatRegister, T_SHORT,
                            $dst$$FloatRegister, T_INT, 8);
    }
  %}
  ins_pipe(pipe_slow);
%}

// Cast double to byte/short/int with SVE: convert in the D lanes, then
// narrow the results down to the destination lane size.
instruct vcvtDtoI_sve(vReg dst, vReg src, vReg tmp) %{
  predicate(UseSVE > 0 &&
            (Matcher::vector_element_basic_type(n) == T_BYTE ||
             Matcher::vector_element_basic_type(n) == T_SHORT ||
             Matcher::vector_element_basic_type(n) == T_INT));
  match(Set dst (VectorCastD2X src));
  effect(TEMP_DEF dst, TEMP tmp);
  format %{ "vcvtDtoI_sve $dst, $src\t# KILL $tmp" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ sve_fcvtzs($dst$$FloatRegister, __ S, ptrue, $src$$FloatRegister, __ D);
    __ sve_vector_narrow($dst$$FloatRegister, __ elemType_to_regVariant(bt),
                         $dst$$FloatRegister, __ D, $tmp$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

// Cast double to long: same lane width, so a single saturating convert
// suffices on both the NEON and the SVE path.
instruct vcvtDtoL(vReg dst, vReg src) %{
  predicate(Matcher::vector_element_basic_type(n) == T_LONG);
  match(Set dst (VectorCastD2X src));
  format %{ "vcvtDtoL $dst, $src" %}
  ins_encode %{
    uint length_in_bytes =  Matcher::vector_length_in_bytes(this);
    if (VM_Version::use_neon_for_vector(length_in_bytes)) {
      // 2D to 2L
      __ fcvtzs($dst$$FloatRegister, __ T2D, $src$$FloatRegister);
    } else {
      assert(UseSVE > 0, "must be sve");
      __ sve_fcvtzs($dst$$FloatRegister, __ D, ptrue, $src$$FloatRegister, __ D);
    }
  %}
  ins_pipe(pipe_slow);
%}

// Cast double to float when the result is a 64-bit vector: a single NEON
// narrowing FP convert.
instruct vcvtDtoF_64b(vReg dst, vReg src) %{
  predicate(Matcher::vector_element_basic_type(n) == T_FLOAT &&
            Matcher::vector_length_in_bytes(n) == 8);
  match(Set dst (VectorCastD2X src));
  format %{ "vcvtDtoF_64b $dst, $src\t# 2D to 2F" %}
  ins_encode %{
    // 2D to 2F
    __ fcvtn($dst$$FloatRegister, __ T2S, $src$$FloatRegister, __ T2D);
  %}
  ins_pipe(pipe_slow);
%}

// Cast double to float when the result is longer than 64 bits (SVE): convert
// in the D lanes, then compact the float results to S-lane spacing.
instruct vcvtDtoF_gt64b(vReg dst, vReg src, vReg tmp) %{
  predicate(Matcher::vector_element_basic_type(n) == T_FLOAT &&
            Matcher::vector_length_in_bytes(n) > 8);
  match(Set dst (VectorCastD2X src));
  effect(TEMP_DEF dst, TEMP tmp);
  format %{ "vcvtDtoF_gt64b $dst, $src\t# vector > 64 bits. KILL $tmp" %}
  ins_encode %{
    assert(UseSVE > 0, "must be sve");
    __ sve_fcvt($dst$$FloatRegister, __ S, ptrue, $src$$FloatRegister, __ D);
    __ sve_vector_narrow($dst$$FloatRegister, __ S,
                         $dst$$FloatRegister, __ D, $tmp$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

// VectorCastHF2F

// Cast half-precision float to float.
instruct vcvtHFtoF(vReg dst, vReg src) %{
  match(Set dst (VectorCastHF2F src));
  format %{ "vcvtHFtoF $dst, $src" %}
  ins_encode %{
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    if (VM_Version::use_neon_for_vector(length_in_bytes)) {
      // 2HF to 2F, 4HF to 4F
      // The T4S/T4H form converts four lanes; for the two-lane case the
      // extra results land in unused upper lanes.
      __ fcvtl($dst$$FloatRegister, __ T4S, $src$$FloatRegister, __ T4H);
    } else {
      assert(UseSVE > 0, "must be sve");
      // Unpack H lanes into S lanes, then convert the half floats in place.
      __ sve_vector_extend($dst$$FloatRegister, __ S, $src$$FloatRegister, __ H);
      __ sve_fcvt($dst$$FloatRegister, __ S, ptrue, $dst$$FloatRegister, __ H);
    }
  %}
  ins_pipe(pipe_slow);
%}

// VectorCastF2HF

// Cast float to half-precision float, NEON flavor (source fits in 128 bits).
instruct vcvtFtoHF_neon(vReg dst, vReg src) %{
  predicate(VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(1))));
  match(Set dst (VectorCastF2HF src));
  format %{ "vcvtFtoHF_neon $dst, $src\t# 2F/4F to 2HF/4HF" %}
  ins_encode %{
    // 2F to 2HF, 4F to 4HF
    __ fcvtn($dst$$FloatRegister, __ T4H, $src$$FloatRegister, __ T4S);
  %}
  ins_pipe(pipe_slow);
%}

// Cast float to half-precision float, SVE flavor: convert in the S lanes,
// then compact the half-float results to H-lane spacing.
instruct vcvtFtoHF_sve(vReg dst, vReg src, vReg tmp) %{
  predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(1))));
  match(Set dst (VectorCastF2HF src));
  effect(TEMP_DEF dst, TEMP tmp);
  format %{ "vcvtFtoHF_sve $dst, $src\t# KILL $tmp" %}
  ins_encode %{
    assert(UseSVE > 0, "must be sve");
    __ sve_fcvt($dst$$FloatRegister, __ H, ptrue, $src$$FloatRegister, __ S);
    __ sve_vector_narrow($dst$$FloatRegister, __ H,
                         $dst$$FloatRegister, __ S, $tmp$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

// ------------------------------ Replicate ------------------------------------

dnl REPLICATE_INT($1,   $2      )
dnl REPLICATE_INT(type, arg_type)
dnl
dnl Emits the rule that broadcasts a scalar general register into every lane
dnl of a vector. Only two arguments are used by the macro body; the element
dnl size is derived from the node at code-emission time.
define(`REPLICATE_INT', `
// Broadcast the scalar in src into every lane of dst.
instruct replicate$1(vReg dst, $2 src) %{
  match(Set dst (Replicate src));
  format %{ "replicate$1 $dst, $src" %}
  ins_encode %{
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    if (VM_Version::use_neon_for_vector(length_in_bytes)) {
      __ dup($dst$$FloatRegister, get_arrangement(this), $src$$Register);
    } else {
      assert(UseSVE > 0, "must be sve");
      __ sve_dup($dst$$FloatRegister, get_reg_variant(this), $src$$Register);
    }
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl
dnl REPLICATE_FP($1,   $2  )
dnl REPLICATE_FP(type, size)
dnl
dnl Emits the rule that broadcasts a scalar FP register into every lane of a
dnl vector. Arg 2 is the SVE lane size used on the predicated copy path.
define(`REPLICATE_FP', `
// Broadcast the scalar FP value in src into every lane of dst.
instruct replicate$1(vReg dst, vReg$1 src) %{
  match(Set dst (Replicate src))dnl
;
  format %{ "replicate$1 $dst, $src" %}
  ins_encode %{
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    if (VM_Version::use_neon_for_vector(length_in_bytes)) {
      __ dup($dst$$FloatRegister, get_arrangement(this), $src$$FloatRegister);
    } else {
      assert(UseSVE > 0, "must be sve");
      __ sve_cpy($dst$$FloatRegister, __ $2, ptrue, $src$$FloatRegister);
    }
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl
// replicate from reg
REPLICATE_INT(I, iRegIorL2I)
REPLICATE_INT(L, iRegL)
dnl The FP replicate macro consumes only two arguments (type and SVE lane
dnl size); the stale trailing T_FLOAT / T_DOUBLE arguments were ignored by
dnl the macro body, so they are dropped here for clarity.
REPLICATE_FP(F, S)
REPLICATE_FP(D, D)

// Replicate a half-precision float value held in a floating point register
// (half floats are modeled with T_SHORT element type, per the predicate).
instruct replicateHF(vReg dst, vRegF src) %{
  predicate(Matcher::vector_element_basic_type(n) == T_SHORT);
  match(Set dst (Replicate src));
  format %{ "replicateHF $dst, $src\t# replicate half-precision float" %}
  ins_encode %{
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    if (VM_Version::use_neon_for_vector(length_in_bytes)) {
      __ dup($dst$$FloatRegister, get_arrangement(this), $src$$FloatRegister);
    } else { // length_in_bytes must be > 16 and SVE should be enabled
      assert(UseSVE > 0, "must be sve");
      __ sve_cpy($dst$$FloatRegister, __ H, ptrue, $src$$FloatRegister);
    }
  %}
  ins_pipe(pipe_slow);
%}

// replicate from imm

// Broadcast an int immediate into a vector of at most 128 bits using the
// NEON move-immediate form. The immediate is masked down to the element
// width first so the encoder sees a value that fits the lane.
instruct replicateI_imm_le128b(vReg dst, immI con) %{
  predicate(Matcher::vector_length_in_bytes(n) <= 16 &&
            Matcher::is_non_long_integral_vector(n));
  match(Set dst (Replicate con));
  format %{ "replicateI_imm_le128b $dst, $con\t# vector <= 128 bits" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    int imm = (int)$con$$constant;
    if (type2aelembytes(bt) == 1) {
      // Refine imm for B
      imm = imm & 0xff;
    } else if (type2aelembytes(bt) == 2) {
      // Refine imm for S
      imm = imm & 0xffff;
    }
    __ mov($dst$$FloatRegister, get_arrangement(this), imm);
  %}
  ins_pipe(pipe_slow);
%}

// Broadcast a byte immediate into a vector longer than 128 bits (SVE dup).
instruct replicateB_imm8_gt128b(vReg dst, immI8 con) %{
  predicate(Matcher::vector_length_in_bytes(n) > 16 &&
            Matcher::vector_element_basic_type(n) == T_BYTE);
  match(Set dst (Replicate con));
  format %{ "replicateB_imm8_gt128b $dst, $con\t# vector > 128 bits" %}
  ins_encode %{
    assert(UseSVE > 0, "must be sve");
    __ sve_dup($dst$$FloatRegister, __ B, (int)($con$$constant));
  %}
  ins_pipe(pipe_slow);
%}

// Broadcast a short/int immediate (restricted by the immIDupV operand to
// what SVE dup can encode) into a vector longer than 128 bits.
instruct replicateI_imm8_gt128b(vReg dst, immIDupV con) %{
  predicate(Matcher::vector_length_in_bytes(n) > 16 &&
            (Matcher::vector_element_basic_type(n) == T_SHORT ||
             Matcher::vector_element_basic_type(n) == T_INT));
  match(Set dst (Replicate con));
  format %{ "replicateI_imm8_gt128b $dst, $con\t# vector > 128 bits" %}
  ins_encode %{
    assert(UseSVE > 0, "must be sve");
    __ sve_dup($dst$$FloatRegister, get_reg_variant(this), (int)($con$$constant));
  %}
  ins_pipe(pipe_slow);
%}

// Broadcast a 64-bit immediate into both lanes of a 128-bit vector.
instruct replicateL_imm_128b(vReg dst, immL con) %{
  predicate(Matcher::vector_length_in_bytes(n) == 16);
  match(Set dst (Replicate con));
  // The predicate restricts this rule to exactly 128-bit vectors, so the
  // debug text says "128 bits"; it previously read "> 128 bits".
  format %{ "replicateL_imm_128b $dst, $con\t# vector 128 bits" %}
  ins_encode %{
    // NEON move-immediate writes both D lanes with the constant.
    __ mov($dst$$FloatRegister, __ T2D, (uint64_t)($con$$constant));
  %}
  ins_pipe(pipe_slow);
%}

// Broadcast a long immediate (restricted by the immLDupV operand to what
// SVE dup can encode) into a vector longer than 128 bits.
instruct replicateL_imm8_gt128b(vReg dst, immLDupV con) %{
  predicate(Matcher::vector_length_in_bytes(n) > 16);
  match(Set dst (Replicate con));
  format %{ "replicateL_imm8_gt128b $dst, $con\t# vector > 128 bits" %}
  ins_encode %{
    assert(UseSVE > 0, "must be sve");
    __ sve_dup($dst$$FloatRegister, __ D, (int)($con$$constant));
  %}
  ins_pipe(pipe_slow);
%}

// Replicate an immediate 16-bit half precision float value
// (vector at most 128 bits; the raw 16-bit pattern is broadcast as-is).
instruct replicateHF_imm_le128b(vReg dst, immH con) %{
  predicate(Matcher::vector_length_in_bytes(n) <= 16);
  match(Set dst (Replicate con));
  format %{ "replicateHF_imm_le128b $dst, $con\t# vector <= 128 bits" %}
  ins_encode %{
    // Mask to the 16-bit lane width before handing to the encoder.
    int imm = (int)($con$$constant) & 0xffff;
    __ mov($dst$$FloatRegister, get_arrangement(this), imm);
  %}
  ins_pipe(pipe_slow);
%}

// Replicate a 16-bit half precision float which is within the limits
// for the operand - immHDupV
instruct replicateHF_imm8_gt128b(vReg dst, immHDupV con) %{
  predicate(Matcher::vector_length_in_bytes(n) > 16);
  match(Set dst (Replicate con));
  format %{ "replicateHF_imm8_gt128b $dst, $con\t# vector > 128 bits" %}
  ins_encode %{
    assert(UseSVE > 0, "must be sve");
    __ sve_dup($dst$$FloatRegister, __ H, (int)($con$$constant));
  %}
  ins_pipe(pipe_slow);
%}

// ------------------------------ Vector insert --------------------------------

// BYTE, SHORT, INT

// Insert the scalar val at constant lane idx of src, for vectors of at most
// 128 bits. Copies src to dst first (if different), then overwrites one lane.
instruct insertI_le128b(vReg dst, vReg src, iRegIorL2I val, immI idx) %{
  predicate(Matcher::vector_length_in_bytes(n) <= 16 &&
            (Matcher::vector_element_basic_type(n) == T_BYTE ||
             Matcher::vector_element_basic_type(n) == T_SHORT ||
             Matcher::vector_element_basic_type(n) == T_INT));
  match(Set dst (VectorInsert (Binary src val) idx));
  format %{ "insertI_le128b $dst, $src, $val, $idx\t# vector <= 128 bits" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    if ($dst$$FloatRegister != $src$$FloatRegister) {
      // Register-to-register copy via orr with itself.
      __ orr($dst$$FloatRegister, length_in_bytes == 16 ? __ T16B : __ T8B,
             $src$$FloatRegister, $src$$FloatRegister);
    }
    __ mov($dst$$FloatRegister, __ elemType_to_regVariant(bt),
           (int)($idx$$constant), $val$$Register);
  %}
  ins_pipe(pipe_slow);
%}

// Insert the scalar val at constant lane idx (idx < 32) for vectors longer
// than 128 bits. Builds a single-lane predicate selecting lane idx, then
// performs a predicated copy of val into that lane.
instruct insertI_index_lt32(vReg dst, vReg src, iRegIorL2I val, immI idx,
                            vReg tmp, pRegGov pgtmp, rFlagsReg cr) %{
  predicate(n->in(2)->get_int() < 32 &&
            Matcher::vector_length_in_bytes(n) > 16 &&
            (Matcher::vector_element_basic_type(n) == T_BYTE ||
             Matcher::vector_element_basic_type(n) == T_SHORT ||
             Matcher::vector_element_basic_type(n) == T_INT));
  match(Set dst (VectorInsert (Binary src val) idx));
  effect(TEMP tmp, TEMP pgtmp, KILL cr);
  // The predicate (and the rule name) admit idx < 32, so the debug text now
  // says "index < 32"; it previously read "index < 31", an off-by-one.
  format %{ "insertI_index_lt32 $dst, $src, $val, $idx\t# vector > 128 bits, index < 32. KILL $tmp, $pgtmp, cr" %}
  ins_encode %{
    assert(UseSVE > 0, "must be sve");
    BasicType bt = Matcher::vector_element_basic_type(this, $src);
    Assembler::SIMD_RegVariant size = __ elemType_to_regVariant(bt);
    // Lane numbers are generated with a -16 bias so that the compare value
    // (idx - 16) stays within the signed immediate range of sve_cmp for
    // every idx below 32.
    __ sve_index($tmp$$FloatRegister, size, -16, 1);
    __ sve_cmp(Assembler::EQ, $pgtmp$$PRegister, size, ptrue,
               $tmp$$FloatRegister, (int)($idx$$constant) - 16);
    if ($dst$$FloatRegister != $src$$FloatRegister) {
      __ sve_orr($dst$$FloatRegister, $src$$FloatRegister, $src$$FloatRegister);
    }
    // Copy val into the single active lane.
    __ sve_cpy($dst$$FloatRegister, size, $pgtmp$$PRegister, $val$$Register);
  %}
  ins_pipe(pipe_slow);
%}

// Insert the scalar val at constant lane idx (idx >= 32). The lane number no
// longer fits the sve_cmp immediate, so the index is materialized in a
// second scratch vector and compared register-to-register.
instruct insertI_index_ge32(vReg dst, vReg src, iRegIorL2I val, immI idx, vReg tmp1,
                            vReg tmp2, pRegGov pgtmp, rFlagsReg cr) %{
  predicate(n->in(2)->get_int() >= 32 &&
            (Matcher::vector_element_basic_type(n) == T_BYTE ||
             Matcher::vector_element_basic_type(n) == T_SHORT ||
             Matcher::vector_element_basic_type(n) == T_INT));
  match(Set dst (VectorInsert (Binary src val) idx));
  effect(TEMP tmp1, TEMP tmp2, TEMP pgtmp, KILL cr);
  format %{ "insertI_index_ge32 $dst, $src, $val, $idx\t# index >= 32. KILL $tmp1, $tmp2, $pgtmp, cr" %}
  ins_encode %{
    assert(UseSVE > 0, "must be sve");
    BasicType bt = Matcher::vector_element_basic_type(this, $src);
    Assembler::SIMD_RegVariant size = __ elemType_to_regVariant(bt);
    // tmp1 holds 0,1,2,... per lane; tmp2 holds idx in every lane.
    __ sve_index($tmp1$$FloatRegister, size, 0, 1);
    __ sve_dup($tmp2$$FloatRegister, size, (int)($idx$$constant));
    __ sve_cmp(Assembler::EQ, $pgtmp$$PRegister, size, ptrue,
               $tmp1$$FloatRegister, $tmp2$$FloatRegister);
    if ($dst$$FloatRegister != $src$$FloatRegister) {
      __ sve_orr($dst$$FloatRegister, $src$$FloatRegister, $src$$FloatRegister);
    }
    // Copy val into the single active lane.
    __ sve_cpy($dst$$FloatRegister, size, $pgtmp$$PRegister, $val$$Register);
  %}
  ins_pipe(pipe_slow);
%}

// LONG

// Insert a long "val" into a 128-bit (2 x long) vector at a constant
// lane index, using a single NEON "mov" into the selected D lane.
instruct insertL_128b(vReg dst, vReg src, iRegL val, immI idx) %{
  predicate(Matcher::vector_length_in_bytes(n) == 16 &&
            Matcher::vector_element_basic_type(n) == T_LONG);
  match(Set dst (VectorInsert (Binary src val) idx));
  format %{ "insertL_128b $dst, $src, $val, $idx\t# 2L" %}
  ins_encode %{
    // Copy src into dst first (if they differ); "mov" then updates only
    // the selected lane.
    if ($dst$$FloatRegister != $src$$FloatRegister) {
      __ orr($dst$$FloatRegister, __ T16B, $src$$FloatRegister, $src$$FloatRegister);
    }
    __ mov($dst$$FloatRegister, __ D, (int)($idx$$constant), $val$$Register);
  %}
  ins_pipe(pipe_slow);
%}

// Insert a long "val" into a vector wider than 128 bits at a constant
// lane index (SVE only). A predicate selecting the target lane is built
// by comparing an index vector against the index. Both the sve_index
// start value and the sve_cmp immediate are biased by -16 so they stay
// within the instructions' signed immediate range.
instruct insertL_gt128b(vReg dst, vReg src, iRegL val, immI idx,
                        vReg tmp, pRegGov pgtmp, rFlagsReg cr) %{
  predicate(Matcher::vector_length_in_bytes(n) > 16 &&
            Matcher::vector_element_basic_type(n) == T_LONG);
  match(Set dst (VectorInsert (Binary src val) idx));
  effect(TEMP tmp, TEMP pgtmp, KILL cr);
  format %{ "insertL_gt128b $dst, $src, $val, $idx\t# vector > 128 bits. KILL $tmp, $pgtmp, cr" %}
  ins_encode %{
    assert(UseSVE > 0, "must be sve");
    // tmp = [-16, -15, -14, ...]; lane idx holds (idx - 16).
    __ sve_index($tmp$$FloatRegister, __ D, -16, 1);
    __ sve_cmp(Assembler::EQ, $pgtmp$$PRegister, __ D, ptrue,
               $tmp$$FloatRegister, (int)($idx$$constant) - 16);
    // Copy src into dst (if they differ), then overwrite the selected
    // lane with val.
    if ($dst$$FloatRegister != $src$$FloatRegister) {
      __ sve_orr($dst$$FloatRegister, $src$$FloatRegister, $src$$FloatRegister);
    }
    __ sve_cpy($dst$$FloatRegister, __ D, $pgtmp$$PRegister, $val$$Register);
  %}
  ins_pipe(pipe_slow);
%}

// FLOAT

// Insert a float "val" into a vector of at most 128 bits at a constant
// lane index, with NEON "ins" from lane 0 of the scalar fp register.
// TEMP_DEF keeps dst from sharing a register with the inputs, so the
// initial copy cannot clobber val.
instruct insertF_le128b(vReg dst, vReg src, vRegF val, immI idx) %{
  predicate(Matcher::vector_length_in_bytes(n) <= 16 &&
            Matcher::vector_element_basic_type(n) == T_FLOAT);
  match(Set dst (VectorInsert (Binary src val) idx));
  effect(TEMP_DEF dst);
  format %{ "insertF_le128b $dst, $src, $val, $idx\t# vector <= 128 bits" %}
  ins_encode %{
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    if ($dst$$FloatRegister != $src$$FloatRegister) {
      __ orr($dst$$FloatRegister, length_in_bytes == 16 ? __ T16B : __ T8B,
             $src$$FloatRegister, $src$$FloatRegister);
    }
    __ ins($dst$$FloatRegister, __ S, $val$$FloatRegister, (int)($idx$$constant), 0);
  %}
  ins_pipe(pipe_slow);
%}

// Insert a float "val" into a vector wider than 128 bits at a constant
// lane index < 32 (SVE only). dst doubles as the scratch register for
// the index vector (hence TEMP_DEF dst and the unconditional sve_orr
// afterwards). Index values are biased by -16 so both the sve_index
// start value and the sve_cmp immediate fit the signed immediate range.
instruct insertF_index_lt32(vReg dst, vReg src, vRegF val, immI idx,
                            pRegGov pgtmp, rFlagsReg cr) %{
  predicate(n->in(2)->get_int() < 32 &&
            Matcher::vector_length_in_bytes(n) > 16 &&
            Matcher::vector_element_basic_type(n) == T_FLOAT);
  match(Set dst (VectorInsert (Binary src val) idx));
  effect(TEMP_DEF dst, TEMP pgtmp, KILL cr);
  format %{ "insertF_index_lt32 $dst, $src, $val, $idx\t# vector > 128 bits, index < 32. KILL $pgtmp, cr" %}
  ins_encode %{
    assert(UseSVE > 0, "must be sve");
    // dst = [-16, -15, -14, ...]; pgtmp selects the lane equal to idx.
    __ sve_index($dst$$FloatRegister, __ S, -16, 1);
    __ sve_cmp(Assembler::EQ, $pgtmp$$PRegister, __ S, ptrue,
               $dst$$FloatRegister, (int)($idx$$constant) - 16);
    // Overwrite the scratch contents of dst with src, then insert val
    // at the selected lane.
    __ sve_orr($dst$$FloatRegister, $src$$FloatRegister, $src$$FloatRegister);
    __ sve_cpy($dst$$FloatRegister, __ S, $pgtmp$$PRegister, $val$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

// Insert a float "val" into a vector at a constant lane index >= 32
// (SVE only). The index exceeds the sve_cmp immediate range, so it is
// broadcast into dst (used as a scratch, hence TEMP_DEF) and compared
// against an index vector in tmp.
instruct insertF_index_ge32(vReg dst, vReg src, vRegF val, immI idx, vReg tmp,
                            pRegGov pgtmp, rFlagsReg cr) %{
  predicate(n->in(2)->get_int() >= 32 &&
            Matcher::vector_element_basic_type(n) == T_FLOAT);
  match(Set dst (VectorInsert (Binary src val) idx));
  effect(TEMP_DEF dst, TEMP tmp, TEMP pgtmp, KILL cr);
  format %{ "insertF_index_ge32 $dst, $src, $val, $idx\t# index >= 32. KILL $tmp, $pgtmp, cr" %}
  ins_encode %{
    assert(UseSVE > 0, "must be sve");
    // tmp = [0, 1, 2, ...], dst = [idx, idx, idx, ...]
    __ sve_index($tmp$$FloatRegister, __ S, 0, 1);
    __ sve_dup($dst$$FloatRegister, __ S, (int)($idx$$constant));
    __ sve_cmp(Assembler::EQ, $pgtmp$$PRegister, __ S, ptrue,
               $tmp$$FloatRegister, $dst$$FloatRegister);
    // Overwrite the scratch contents of dst with src, then insert val
    // at the selected lane.
    __ sve_orr($dst$$FloatRegister, $src$$FloatRegister, $src$$FloatRegister);
    __ sve_cpy($dst$$FloatRegister, __ S, $pgtmp$$PRegister, $val$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

// DOUBLE

// Insert a double "val" into a 128-bit (2 x double) vector at a constant
// lane index, using NEON "ins" from lane 0 of the scalar fp register.
// TEMP_DEF keeps dst from sharing a register with the inputs, so the
// initial copy cannot clobber val.
instruct insertD_128b(vReg dst, vReg src, vRegD val, immI idx) %{
  predicate(Matcher::vector_length_in_bytes(n) == 16 &&
            Matcher::vector_element_basic_type(n) == T_DOUBLE);
  match(Set dst (VectorInsert (Binary src val) idx));
  effect(TEMP_DEF dst);
  format %{ "insertD_128b $dst, $src, $val, $idx\t# 2D" %}
  ins_encode %{
    if ($dst$$FloatRegister != $src$$FloatRegister) {
      __ orr($dst$$FloatRegister, __ T16B, $src$$FloatRegister, $src$$FloatRegister);
    }
    __ ins($dst$$FloatRegister, __ D, $val$$FloatRegister, (int)($idx$$constant), 0);
  %}
  ins_pipe(pipe_slow);
%}

// Insert a double "val" into a vector wider than 128 bits at a constant
// lane index (SVE only). dst doubles as the scratch register for the
// index vector (hence TEMP_DEF dst and the unconditional sve_orr).
// Index values are biased by -16 so both the sve_index start value and
// the sve_cmp immediate fit the signed immediate range.
instruct insertD_gt128b(vReg dst, vReg src, vRegD val, immI idx,
                        pRegGov pgtmp, rFlagsReg cr) %{
  predicate(Matcher::vector_length_in_bytes(n) > 16 &&
            Matcher::vector_element_basic_type(n) == T_DOUBLE);
  match(Set dst (VectorInsert (Binary src val) idx));
  effect(TEMP_DEF dst, TEMP pgtmp, KILL cr);
  format %{ "insertD_gt128b $dst, $src, $val, $idx\t# vector > 128 bits. KILL $pgtmp, cr" %}
  ins_encode %{
    assert(UseSVE > 0, "must be sve");
    // dst = [-16, -15, -14, ...]; pgtmp selects the lane equal to idx.
    __ sve_index($dst$$FloatRegister, __ D, -16, 1);
    __ sve_cmp(Assembler::EQ, $pgtmp$$PRegister, __ D, ptrue,
               $dst$$FloatRegister, (int)($idx$$constant) - 16);
    // Overwrite the scratch contents of dst with src, then insert val
    // at the selected lane.
    __ sve_orr($dst$$FloatRegister, $src$$FloatRegister, $src$$FloatRegister);
    __ sve_cpy($dst$$FloatRegister, __ D, $pgtmp$$PRegister, $val$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

// ------------------------------ Extract --------------------------------------
dnl
dnl EXTRACT_INT_SMALL($1,   $2,    $3,       $4,   $5  )
dnl EXTRACT_INT_SMALL(type, index, arg_type, insn, size)
dnl Emits "extract$1_index_lt$2": extract the element at a constant lane
dnl index below $2 into a general register with a single Advanced SIMD
dnl move instruction $4 (smov/umov) using arrangement $5.
define(`EXTRACT_INT_SMALL', `
instruct extract$1_index_lt$2($3 dst, vReg src, immI idx) %{
  predicate(n->in(2)->get_int() < $2);
  match(Set dst (Extract$1 src idx));
  format %{ "extract$1_index_lt$2 $dst, $src, $idx\t# index < $2" %}
  ins_encode %{
    __ $4($dst$$Register, $src$$FloatRegister, __ $5, (int)($idx$$constant));
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl
dnl
dnl EXTRACT_INT_LARGE($1,   $2,    $3,       $4       )
dnl EXTRACT_INT_LARGE(type, index, arg_type, data_type)
dnl Emits "extract$1_index_ge$2": the constant lane index lies outside
dnl the low 128 bits, so the extract goes through the SVE helper
dnl sve_extract_integral with data type $4 and a scratch vector register.
define(`EXTRACT_INT_LARGE', `
instruct extract$1_index_ge$2($3 dst, vReg src, immI idx, vReg tmp) %{
  predicate(n->in(2)->get_int() >= $2);
  match(Set dst (Extract$1 src idx));
  effect(TEMP tmp);
  format %{ "extract$1_index_ge$2 $dst, $src, $idx\t# index >=$2. KILL $tmp" %}
  ins_encode %{
    assert(UseSVE > 0, "must be sve");
    __ sve_extract_integral($dst$$Register, $4, $src$$FloatRegister,
                            (int)($idx$$constant), $tmp$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl

// BOOLEAN

// Extract a boolean element at a variable (register) index: place the
// index in lane 0 of tmp, use it as a table-lookup index into src
// (NEON tbl or SVE sve_tbl), then move the selected byte to dst.
instruct extractUB_ireg(iRegINoSp dst, vReg src, iRegI idx, vReg tmp) %{
  match(Set dst (ExtractUB src idx));
  effect(TEMP tmp);
  format %{ "extractUB_ireg $dst, $src, $idx\t# variable index. KILL $tmp" %}
  ins_encode %{
    // Input "src" is a vector of boolean represented as
    // bytes with 0x00/0x01 as element values.
    // "idx" is expected to be in range.

    uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src);
    // tmp lane 0 = idx, used as the shuffle index below.
    __ mov($tmp$$FloatRegister, __ B, 0, $idx$$Register);
    if (VM_Version::use_neon_for_vector(length_in_bytes)) {
      __ tbl($tmp$$FloatRegister, length_in_bytes == 16 ? __ T16B : __ T8B,
             $src$$FloatRegister, 1, $tmp$$FloatRegister);
    } else {
      assert(UseSVE > 0, "must be sve");
      __ sve_tbl($tmp$$FloatRegister, __ B, $src$$FloatRegister, $tmp$$FloatRegister);
    }
    // The looked-up element now sits in lane 0 of tmp.
    __ smov($dst$$Register, $tmp$$FloatRegister, __ B, 0);
  %}
  ins_pipe(pipe_slow);
%}
// Constant-index extracts: indices below 16/8/4/2 (the lane count of a
// 128-bit register for the given element size) use a single smov/umov;
// larger indices go through sve_extract_integral.
EXTRACT_INT_SMALL(UB, 16, iRegINoSp, smov, B)
EXTRACT_INT_LARGE(UB, 16, iRegINoSp, T_BYTE)

// BYTE
EXTRACT_INT_SMALL(B, 16, iRegINoSp, smov, B)
EXTRACT_INT_LARGE(B, 16, iRegINoSp, T_BYTE)

// SHORT
EXTRACT_INT_SMALL(S, 8, iRegINoSp, smov, H)
EXTRACT_INT_LARGE(S, 8, iRegINoSp, T_SHORT)

// INT
EXTRACT_INT_SMALL(I, 4, iRegINoSp, umov, S)
EXTRACT_INT_LARGE(I, 4, iRegINoSp, T_INT)

// LONG
EXTRACT_INT_SMALL(L, 2, iRegLNoSp, umov, D)
EXTRACT_INT_LARGE(L, 2, iRegLNoSp, T_LONG)

dnl
dnl EXTRACT_FP($1,   $2,   $3,    $4,   $5   )
dnl EXTRACT_FP(type, insn, index, size, shift)
dnl Emits "extract$1": extract a floating-point element at a constant
dnl lane index. Index 0 is a plain fp register move ($2); an index still
dnl within the low 128 bits uses NEON "ins" into lane 0 (size $4);
dnl otherwise the element is brought down to lane 0 with sve_ext, whose
dnl operand is a byte offset, hence the shift by $5 (log2 element size).
define(`EXTRACT_FP', `
instruct extract$1(vReg$1 dst, vReg src, immI idx) %{
  match(Set dst (Extract$1 src idx));
  effect(TEMP_DEF dst);
  format %{ "extract$1 $dst, $src, $idx" %}
  ins_encode %{
    int index = (int)$idx$$constant;
    if (index == 0) {
      __ $2($dst$$FloatRegister, $src$$FloatRegister);
    } else if (index < $3) {
      __ ins($dst$$FloatRegister, __ $4, $src$$FloatRegister, 0, index);
    } else {
      assert(UseSVE > 0, "must be sve");
      __ sve_orr($dst$$FloatRegister, $src$$FloatRegister, $src$$FloatRegister);
      __ sve_ext($dst$$FloatRegister, $dst$$FloatRegister, index << $5);
    }
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl
// FLOAT
// 4 = S lanes in 128 bits; shift 2 converts lane index to byte offset.
EXTRACT_FP(F, fmovs, 4, S, 2)

// DOUBLE
// 2 = D lanes in 128 bits; shift 3 converts lane index to byte offset.
EXTRACT_FP(D, fmovd, 2, D, 3)

// ------------------------------ Vector mask load/store -----------------------

// vector load mask

// Convert a vector of booleans (bytes holding 0/1) into a NEON vector
// mask with 0/-1 (all-ones) lanes of the destination element size:
// widen the bytes step by step as needed, then negate so 1 becomes -1.
instruct vloadmask_neon(vReg dst, vReg src) %{
  predicate(UseSVE == 0 &&
            (Matcher::vector_length_in_bytes(n) == 8 ||
             Matcher::vector_length_in_bytes(n) == 16));
  match(Set dst (VectorLoadMask src ));
  format %{ "vloadmask_neon $dst, $src" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    if (bt == T_BYTE) {
      // Byte elements need no widening: negate 0/1 in place.
      uint length_in_bytes = Matcher::vector_length_in_bytes(this);
      __ negr($dst$$FloatRegister, length_in_bytes == 16 ? __ T16B : __ T8B,
              $src$$FloatRegister);
    } else {
      // Widen B -> H, then optionally H -> S and S -> D, before negating.
      __ uxtl($dst$$FloatRegister, __ T8H, $src$$FloatRegister, __ T8B);
      if (type2aelembytes(bt) >= 4) {
        __ uxtl($dst$$FloatRegister, __ T4S, $dst$$FloatRegister, __ T4H);
      }
      if (type2aelembytes(bt) == 8) {
        __ uxtl($dst$$FloatRegister, __ T2D, $dst$$FloatRegister, __ T2S);
      }
      __ negr($dst$$FloatRegister, get_arrangement(this), $dst$$FloatRegister);
    }
  %}
  ins_pipe(pipe_slow);
%}

// Convert a vector of booleans (byte elements) into an SVE predicate:
// a lane becomes active iff its byte is non-zero.
instruct vloadmaskB_sve(pReg dst, vReg src, rFlagsReg cr) %{
  predicate(UseSVE > 0 && Matcher::vector_element_basic_type(n) == T_BYTE);
  match(Set dst (VectorLoadMask src));
  effect(KILL cr);
  format %{ "vloadmaskB_sve $dst, $src\t# KILL cr" %}
  ins_encode %{
    __ sve_cmp(Assembler::NE, $dst$$PRegister, __ B,
               ptrue, $src$$FloatRegister, 0);
  %}
  ins_pipe(pipe_slow);
%}

// Convert a vector of booleans into an SVE predicate for non-byte
// element types: first extend the bytes to the destination element
// size, then compare against zero to form the predicate.
instruct vloadmask_extend_sve(pReg dst, vReg src, vReg tmp, rFlagsReg cr) %{
  predicate(UseSVE > 0 && Matcher::vector_element_basic_type(n) != T_BYTE);
  match(Set dst (VectorLoadMask src));
  effect(TEMP tmp, KILL cr);
  format %{ "vloadmask_extend_sve $dst, $src\t# KILL $tmp, cr" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    Assembler::SIMD_RegVariant size = __ elemType_to_regVariant(bt);
    __ sve_vector_extend($tmp$$FloatRegister, size, $src$$FloatRegister, __ B);
    __ sve_cmp(Assembler::NE, $dst$$PRegister, size, ptrue, $tmp$$FloatRegister, 0);
  %}
  ins_pipe(pipe_slow);
%}

// Predicated variant of vloadmaskB_sve: the zero-compare is governed by
// the incoming predicate "pg" instead of ptrue.
instruct vloadmaskB_masked(pReg dst, vReg src, pRegGov pg, rFlagsReg cr) %{
  predicate(UseSVE > 0 && Matcher::vector_element_basic_type(n) == T_BYTE);
  match(Set dst (VectorLoadMask src pg));
  effect(KILL cr);
  format %{ "vloadmaskB_masked $dst, $pg, $src\t# KILL cr" %}
  ins_encode %{
    __ sve_cmp(Assembler::NE, $dst$$PRegister, __ B,
               $pg$$PRegister, $src$$FloatRegister, 0);
  %}
  ins_pipe(pipe_slow);
%}

// Predicated variant of vloadmask_extend_sve: extend the boolean bytes
// to the element size, then zero-compare under the incoming predicate.
instruct vloadmask_extend_masked(pReg dst, vReg src, pRegGov pg, vReg tmp, rFlagsReg cr) %{
  predicate(UseSVE > 0 && Matcher::vector_element_basic_type(n) != T_BYTE);
  match(Set dst (VectorLoadMask src pg));
  effect(TEMP tmp, KILL cr);
  format %{ "vloadmask_extend_masked $dst, $pg, $src\t# KILL $tmp, cr" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    Assembler::SIMD_RegVariant size = __ elemType_to_regVariant(bt);
    __ sve_vector_extend($tmp$$FloatRegister, size, $src$$FloatRegister, __ B);
    __ sve_cmp(Assembler::NE, $dst$$PRegister, size,
               $pg$$PRegister, $tmp$$FloatRegister, 0);
  %}
  ins_pipe(pipe_slow);
%}

// vector store mask - neon

// Convert a NEON byte mask (0/-1 lanes) back to a vector of booleans
// (0/1 bytes) by negating each lane.
instruct vstoremaskB_neon(vReg dst, vReg src, immI_1 size) %{
  predicate(UseSVE == 0);
  match(Set dst (VectorStoreMask src size));
  format %{ "vstoremaskB_neon $dst, $src" %}
  ins_encode %{
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    assert(length_in_bytes == 8 || length_in_bytes == 16, "must be");
    __ negr($dst$$FloatRegister, length_in_bytes == 16 ? __ T16B : __ T8B,
            $src$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

// Convert a NEON mask with wider elements (size 2/4/8 bytes) to a
// vector of booleans: narrow step by step down to bytes with xtn, then
// negate so -1 lanes become 1.
instruct vstoremask_narrow_neon(vReg dst, vReg src, immI_gt_1 size) %{
  predicate(UseSVE == 0);
  match(Set dst (VectorStoreMask src size));
  format %{ "vstoremask_narrow_neon $dst, $src" %}
  ins_encode %{
    int esize = (int)$size$$constant;
    if (esize == 2) {
      // H -> B
      __ xtn($dst$$FloatRegister, __ T8B, $src$$FloatRegister, __ T8H);
    } else if (esize == 4) {
      // S -> H -> B
      __ xtn($dst$$FloatRegister, __ T4H, $src$$FloatRegister, __ T4S);
      __ xtn($dst$$FloatRegister, __ T8B, $dst$$FloatRegister, __ T8H);
    } else {
      assert(esize == 8, "must be");
      // D -> S -> H -> B
      __ xtn($dst$$FloatRegister, __ T2S, $src$$FloatRegister, __ T2D);
      __ xtn($dst$$FloatRegister, __ T4H, $dst$$FloatRegister, __ T4S);
      __ xtn($dst$$FloatRegister, __ T8B, $dst$$FloatRegister, __ T8H);
    }
    __ negr($dst$$FloatRegister, __ T8B, $dst$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

// vector store mask - sve

// Convert an SVE predicate (byte elements) to a vector of booleans:
// materialize 1 in every active lane of dst via predicated sve_cpy.
instruct vstoremaskB_sve(vReg dst, pReg src, immI_1 size) %{
  predicate(UseSVE > 0);
  match(Set dst (VectorStoreMask src size));
  format %{ "vstoremaskB_sve $dst, $src" %}
  ins_encode %{
    __ sve_cpy($dst$$FloatRegister, __ B, $src$$PRegister, 1, false);
  %}
  ins_pipe(pipe_slow);
%}

// Convert an SVE predicate with wider elements (size > 1 byte) to a
// vector of booleans: set 1 in active lanes at the element size, then
// narrow the result down to bytes.
instruct vstoremask_narrow_sve(vReg dst, pReg src, immI_gt_1 size, vReg tmp) %{
  predicate(UseSVE > 0);
  match(Set dst (VectorStoreMask src size));
  effect(TEMP_DEF dst, TEMP tmp);
  format %{ "vstoremask_narrow_sve $dst, $src\t# KILL $tmp" %}
  ins_encode %{
    Assembler::SIMD_RegVariant size = __ elemBytes_to_regVariant((int)$size$$constant);
    __ sve_cpy($dst$$FloatRegister, size, $src$$PRegister, 1, false);
    __ sve_vector_narrow($dst$$FloatRegister, __ B,
                         $dst$$FloatRegister, size, $tmp$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

// Combined rules for vector mask load when the vector element type is not T_BYTE

// VectorLoadMask+LoadVector, and the VectorLoadMask is unpredicated.
// VectorLoadMask+LoadVector, and the VectorLoadMask is unpredicated.
// The booleans are loaded with an extending predicated load into tmp,
// then compared against zero to form the destination predicate.
instruct vloadmask_loadV(pReg dst, indirect mem, vReg tmp, rFlagsReg cr) %{
  predicate(UseSVE > 0 &&
            type2aelembytes(Matcher::vector_element_basic_type(n)) > 1);
  match(Set dst (VectorLoadMask (LoadVector mem)));
  effect(TEMP tmp, KILL cr);
  format %{ "vloadmask_loadV $dst, $mem\t# KILL $tmp, cr" %}
  ins_encode %{
    // Load mask values which are boolean type, and extend them to the
    // defined vector element type. Convert the vector to predicate.
    BasicType bt = Matcher::vector_element_basic_type(this);
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    // Unpredicated case: the result must cover the whole vector, so the
    // full-sized ptrue governs both the load and the compare.
    assert(length_in_bytes == MaxVectorSize, "invalid vector length");
    loadStoreA_predicated(masm, false, $tmp$$FloatRegister,
                          ptrue, T_BOOLEAN, bt, $mem->opcode(),
                          as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp);
    __ sve_cmp(Assembler::NE, $dst$$PRegister, __ elemType_to_regVariant(bt),
               ptrue, $tmp$$FloatRegister, 0);
  %}
  ins_pipe(pipe_slow);
%}

// VectorLoadMask+LoadVector, and the VectorLoadMask is predicated.
// VectorLoadMask+LoadVector, and the VectorLoadMask is predicated.
// Same as vloadmask_loadV, but both the extending load and the
// zero-compare are governed by the incoming predicate "pg".
instruct vloadmask_loadV_masked(pReg dst, indirect mem, pRegGov pg,
                                vReg tmp, rFlagsReg cr) %{
  predicate(UseSVE > 0 &&
            type2aelembytes(Matcher::vector_element_basic_type(n)) > 1);
  match(Set dst (VectorLoadMask (LoadVector mem) pg));
  effect(TEMP tmp, KILL cr);
  format %{ "vloadmask_loadV_masked $dst, $pg, $mem\t# KILL $tmp, cr" %}
  ins_encode %{
    // Load valid mask values which are boolean type, and extend them to the
    // defined vector element type. Convert the vector to predicate.
    BasicType bt = Matcher::vector_element_basic_type(this);
    loadStoreA_predicated(masm, false, $tmp$$FloatRegister,
                          $pg$$PRegister, T_BOOLEAN, bt, $mem->opcode(),
                          as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp);
    __ sve_cmp(Assembler::NE, $dst$$PRegister, __ elemType_to_regVariant(bt),
               $pg$$PRegister, $tmp$$FloatRegister, 0);
  %}
  ins_pipe(pipe_slow);
%}

// VectorLoadMask+LoadVectorMasked, and the VectorLoadMask is unpredicated.
// VectorLoadMask+LoadVectorMasked, and the VectorLoadMask is unpredicated.
// The inner load's predicate "pg" is matched but deliberately unused in
// the encoding (see the note below).
instruct vloadmask_loadVMasked(pReg dst, vmemA mem, pRegGov pg, vReg tmp, rFlagsReg cr) %{
  predicate(UseSVE > 0 &&
            type2aelembytes(Matcher::vector_element_basic_type(n)) > 1);
  match(Set dst (VectorLoadMask (LoadVectorMasked mem pg)));
  effect(TEMP tmp, KILL cr);
  format %{ "vloadmask_loadVMasked $dst, $mem\t# KILL $tmp, cr" %}
  ins_encode %{
    // Load mask values which are boolean type, and extend them to the
    // defined vector element type. Convert the vector to predicate.
    //
    // Note that we cannot use "pg" here, since it is the predicate used
    // for the vector load with boolean type. But the predicate used in
    // the extending "sve_ld1b" is based on the final extended vector type,
    // which is the full-sized predicate (ptrue) used in VectorLoadMask.
    BasicType bt = Matcher::vector_element_basic_type(this);
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    assert(length_in_bytes == MaxVectorSize, "invalid vector length");
    loadStoreA_predicated(masm, false, $tmp$$FloatRegister,
                          ptrue, T_BOOLEAN, bt, $mem->opcode(),
                          as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp);
    __ sve_cmp(Assembler::NE, $dst$$PRegister, __ elemType_to_regVariant(bt),
               ptrue, $tmp$$FloatRegister, 0);
  %}
  ins_pipe(pipe_slow);
%}

// VectorLoadMask+LoadVectorMasked, and the VectorLoadMask is predicated.
// VectorLoadMask+LoadVectorMasked, and the VectorLoadMask is predicated.
// "pg1" (the inner load's predicate) is matched but unused in the
// encoding; "pg2" (the VectorLoadMask predicate) governs both the
// extending load and the zero-compare (see the note below).
instruct vloadmask_loadVMasked_masked(pReg dst, vmemA mem, pRegGov pg1, pRegGov pg2,
                                      vReg tmp, rFlagsReg cr) %{
  predicate(UseSVE > 0 &&
            type2aelembytes(Matcher::vector_element_basic_type(n)) > 1);
  match(Set dst (VectorLoadMask (LoadVectorMasked mem pg1) pg2));
  effect(TEMP tmp, KILL cr);
  format %{ "vloadmask_loadVMasked_masked $dst, $pg2, $mem\t# KILL $tmp, cr" %}
  ins_encode %{
    // Load valid mask values which are boolean type, and extend them to the
    // defined vector element type. Convert the vector to predicate.
    //
    // Note that we cannot use "pg1" here, since it is the predicate used
    // for the vector load with boolean type. But the predicate used in
    // the extending "sve_ld1b" is based on the final extended vector type,
    // which is the "pg2" used in VectorLoadMask.
    BasicType bt = Matcher::vector_element_basic_type(this);
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    assert(length_in_bytes == MaxVectorSize, "invalid vector length");
    loadStoreA_predicated(masm, false, $tmp$$FloatRegister,
                          $pg2$$PRegister, T_BOOLEAN, bt, $mem->opcode(),
                          as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp);
    __ sve_cmp(Assembler::NE, $dst$$PRegister, __ elemType_to_regVariant(bt),
               $pg2$$PRegister, $tmp$$FloatRegister, 0);
  %}
  ins_pipe(pipe_slow);
%}

// Combined rules for vector mask store when the vector element type is not T_BYTE

// StoreVector+VectorStoreMask, and the vector size of "src" is equal to the MaxVectorSize.
// StoreVector+VectorStoreMask, and the vector size of "src" is equal to the MaxVectorSize.
// The predicate "src" is first materialized as a vector of 0/1 at the
// element size, then stored as booleans with a narrowing predicated store.
instruct storeV_vstoremask(indirect mem, pReg src, immI_gt_1 esize, vReg tmp) %{
  predicate(UseSVE > 0 &&
            Matcher::vector_length_in_bytes(n->as_StoreVector()->in(MemNode::ValueIn)->in(1)) == MaxVectorSize);
  match(Set mem (StoreVector mem (VectorStoreMask src esize)));
  effect(TEMP tmp);
  format %{ "storeV_vstoremask $mem, $src\t# KILL $tmp" %}
  ins_encode %{
    // Convert the valid src predicate to vector, and store the vector elements
    // as boolean values.
    BasicType bt = Matcher::vector_element_basic_type(this, $src);
    assert(type2aelembytes(bt) == (int)$esize$$constant, "unsupported type");
    Assembler::SIMD_RegVariant size = __ elemBytes_to_regVariant($esize$$constant);
    __ sve_cpy($tmp$$FloatRegister, size, $src$$PRegister, 1, false);
    // Full-sized vector: ptrue governs the narrowing boolean store.
    loadStoreA_predicated(masm, true, $tmp$$FloatRegister,
                          ptrue, T_BOOLEAN, bt, $mem->opcode(),
                          as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp);
  %}
  ins_pipe(pipe_slow);
%}

// StoreVector+VectorStoreMask, and the vector size of "src" is less than the MaxVectorSize.
// StoreVector+VectorStoreMask, and the vector size of "src" is less than the MaxVectorSize.
// As storeV_vstoremask, but the store predicate must be generated to
// cover only the partial vector's lanes (sve_gen_mask_imm into pgtmp).
instruct storeV_vstoremask_masked(indirect mem, pReg src, immI_gt_1 esize,
                                  vReg tmp, pRegGov pgtmp, rFlagsReg cr) %{
  predicate(UseSVE > 0 &&
            Matcher::vector_length_in_bytes(n->as_StoreVector()->in(MemNode::ValueIn)->in(1)) < MaxVectorSize);
  match(Set mem (StoreVector mem (VectorStoreMask src esize)));
  effect(TEMP tmp, TEMP pgtmp, KILL cr);
  format %{ "storeV_vstoremask_masked $mem, $src\t# KILL $tmp, $pgtmp, cr" %}
  ins_encode %{
    // Convert the valid src predicate to vector, and store the vector elements
    // as boolean values.
    BasicType bt = Matcher::vector_element_basic_type(this, $src);
    Assembler::SIMD_RegVariant size = __ elemType_to_regVariant(bt);
    __ sve_cpy($tmp$$FloatRegister, size, $src$$PRegister, 1, false);
    __ sve_gen_mask_imm($pgtmp$$PRegister, bt, Matcher::vector_length(this, $src));
    loadStoreA_predicated(masm, true, $tmp$$FloatRegister,
                          $pgtmp$$PRegister, T_BOOLEAN, bt, $mem->opcode(),
                          as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp);
  %}
  ins_pipe(pipe_slow);
%}

// StoreVectorMasked+VectorStoreMask, and the vector size of "src" is equal to the MaxVectorSize.
// StoreVectorMasked+VectorStoreMask, and the vector size of "src" is equal to the MaxVectorSize.
// The store predicate "pg" is matched but deliberately unused in the
// encoding (see the note below); ptrue governs the narrowing store.
instruct storeVMasked_vstoremask(vmemA mem, pReg src, pRegGov pg, immI_gt_1 esize, vReg tmp) %{
  predicate(UseSVE > 0 &&
            Matcher::vector_length_in_bytes(n->as_StoreVector()->in(MemNode::ValueIn)->in(1)) == MaxVectorSize);
  match(Set mem (StoreVectorMasked mem (Binary (VectorStoreMask src esize) pg)));
  effect(TEMP tmp);
  format %{ "storeVMasked_vstoremask $mem, $src\t# KILL $tmp" %}
  ins_encode %{
    // Convert the valid src predicate to vector, and store the vector elements
    // as boolean values.
    //
    // Note that we cannot use "pg" here, since it is the predicate used
    // for the vector store with boolean type. But the predicate used in
    // the narrowing "sve_st1b" is based on the "src" vector type, which
    // is the full-sized predicate (ptrue) here.
    BasicType bt = Matcher::vector_element_basic_type(this, $src);
    assert(type2aelembytes(bt) == (int)$esize$$constant, "unsupported type.");
    Assembler::SIMD_RegVariant size = __ elemBytes_to_regVariant($esize$$constant);
    __ sve_cpy($tmp$$FloatRegister, size, $src$$PRegister, 1, false);
    loadStoreA_predicated(masm, true, $tmp$$FloatRegister,
                          ptrue, T_BOOLEAN, bt, $mem->opcode(),
                          as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp);
  %}
  ins_pipe(pipe_slow);
%}

// StoreVectorMasked+VectorStoreMask, and the vector size of "src" is less than the MaxVectorSize.
// StoreVectorMasked+VectorStoreMask, and the vector size of "src" is less than the MaxVectorSize.
// "pg" is matched but unused in the encoding; the store predicate for
// the partial vector is generated into pgtmp (see the note below).
instruct storeVMasked_vstoremask_masked(vmemA mem, pReg src, pRegGov pg, immI_gt_1 esize,
                                        vReg tmp, pRegGov pgtmp, rFlagsReg cr) %{
  predicate(UseSVE > 0 &&
            Matcher::vector_length_in_bytes(n->as_StoreVector()->in(MemNode::ValueIn)->in(1)) < MaxVectorSize);
  match(Set mem (StoreVectorMasked mem (Binary (VectorStoreMask src esize) pg)));
  effect(TEMP tmp, TEMP pgtmp, KILL cr);
  format %{ "storeVMasked_vstoremask_masked $mem, $src\t# KILL $tmp, $pgtmp, cr" %}
  ins_encode %{
    // Convert the valid src predicate to vector, and store the vector elements
    // as boolean values.
    //
    // Note that we cannot use "pg" here, since it is the predicate used for the
    // vector store with boolean type. But the predicate used in the narrowing
    // "sve_st1b" is based on the "src" vector type, which needed to be generated
    // here.
    BasicType bt = Matcher::vector_element_basic_type(this, $src);
    Assembler::SIMD_RegVariant size = __ elemType_to_regVariant(bt);
    __ sve_cpy($tmp$$FloatRegister, size, $src$$PRegister, 1, false);
    __ sve_gen_mask_imm($pgtmp$$PRegister, bt, Matcher::vector_length(this, $src));
    loadStoreA_predicated(masm, true, $tmp$$FloatRegister,
                          $pgtmp$$PRegister, T_BOOLEAN, bt, $mem->opcode(),
                          as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp);
  %}
  ins_pipe(pipe_slow);
%}

// ------------------------------ Vector mask basic OPs ------------------------

dnl
dnl VMASK_BITWISE_OP($1,   $2,      $3  )
dnl VMASK_BITWISE_OP(type, op_name, insn)
dnl Emits "vmask_$1": a predicate-to-predicate bitwise operation. $2 is
dnl the ideal node matched and $3 the SVE predicate instruction, which
dnl is governed by the all-true predicate ptrue.
define(`VMASK_BITWISE_OP', `
instruct vmask_$1(pReg pd, pReg pn, pReg pm) %{
  predicate(UseSVE > 0);
  match(Set pd ($2 pn pm));
  format %{ "vmask_$1 $pd, $pn, $pm" %}
  ins_encode %{
    __ $3($pd$$PRegister, ptrue, $pn$$PRegister, $pm$$PRegister);
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl
dnl VMASK_AND_NOT($1  )
dnl VMASK_AND_NOT(type)
dnl Emits "vmask_and_not$1". Xor-ing a mask with MaskAll(-1) (all ones)
dnl is a bitwise NOT, so And(pn, Xor(pm, all-ones)) collapses into a
dnl single predicate BIC (and-not) instruction.
define(`VMASK_AND_NOT', `
instruct vmask_and_not$1(pReg pd, pReg pn, pReg pm, imm$1_M1 m1) %{
  predicate(UseSVE > 0);
  match(Set pd (AndVMask pn (XorVMask pm (MaskAll m1))));
  format %{ "vmask_and_not$1 $pd, $pn, $pm" %}
  ins_encode %{
    __ sve_bic($pd$$PRegister, ptrue, $pn$$PRegister, $pm$$PRegister);
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl
// vector mask logical ops: and/or/xor/and_not
// The and_not variant is instantiated for both int (I) and long (L)
// all-ones MaskAll constants.
VMASK_BITWISE_OP(and, AndVMask, sve_and)
VMASK_BITWISE_OP(or,  OrVMask,  sve_orr)
VMASK_BITWISE_OP(xor, XorVMask, sve_eor)
VMASK_AND_NOT(I)
VMASK_AND_NOT(L)

// vector mask compare

// Lane-wise compare of two NEON vectors; the result is a vector mask
// produced by the neon_compare helper for the node's element type and
// the given BoolTest condition.
instruct vmaskcmp_neon(vReg dst, vReg src1, vReg src2, immI cond) %{
  predicate(UseSVE == 0 &&
            (Matcher::vector_length_in_bytes(n) == 8 ||
             Matcher::vector_length_in_bytes(n) == 16));
  match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
  format %{ "vmaskcmp_neon $dst, $src1, $src2, $cond" %}
  ins_encode %{
    Assembler::Condition condition = to_assembler_cond((BoolTest::mask)$cond$$constant);
    BasicType bt = Matcher::vector_element_basic_type(this);
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    __ neon_compare($dst$$FloatRegister, bt, $src1$$FloatRegister,
                    $src2$$FloatRegister, condition, /* isQ */ length_in_bytes == 16);
  %}
  ins_pipe(pipe_slow);
%}
dnl VMASKCMP_ZERO_NEON($1,   $2        )
dnl VMASKCMP_ZERO_NEON(type, basic_type)
dnl Emits "vmaskcmp_zero$1_neon": compare a NEON vector against a
dnl replicated zero via neon_compare_zero. For $1 == I the element type
dnl is read from the node at runtime; otherwise the explicit $2
dnl BasicType is used.
define(`VMASKCMP_ZERO_NEON', `
instruct vmaskcmp_zero$1_neon(vReg dst, vReg src, imm`$1'0 zero, immI_cmp_cond cond) %{
  predicate(UseSVE == 0);
  match(Set dst (VectorMaskCmp (Binary src (Replicate zero)) cond));
  format %{ "vmaskcmp_zero$1_neon $dst, $src, #0, $cond" %}
  ins_encode %{
    Assembler::Condition condition = to_assembler_cond((BoolTest::mask)$cond$$constant);
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    __ neon_compare_zero($dst$$FloatRegister, ifelse($1, I, Matcher::vector_element_basic_type(this), $2),
                         $src$$FloatRegister,
                         condition, /* isQ */ length_in_bytes == 16);
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl
// The "I" instantiation passes no explicit type, so the macro falls
// back to the node's runtime element type; the others fix the type.
VMASKCMP_ZERO_NEON(I)
VMASKCMP_ZERO_NEON(L, T_LONG)
VMASKCMP_ZERO_NEON(F, T_FLOAT)
VMASKCMP_ZERO_NEON(D, T_DOUBLE)

// Lane-wise compare of two SVE vectors producing a predicate result,
// governed by ptrue (the vector must be full-sized, see the assert).
instruct vmaskcmp_sve(pReg dst, vReg src1, vReg src2, immI cond, rFlagsReg cr) %{
  predicate(UseSVE > 0);
  match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
  effect(KILL cr);
  format %{ "vmaskcmp_sve $dst, $src1, $src2, $cond\t# KILL cr" %}
  ins_encode %{
    Assembler::Condition condition = to_assembler_cond((BoolTest::mask)$cond$$constant);
    BasicType bt = Matcher::vector_element_basic_type(this);
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    assert(length_in_bytes == MaxVectorSize, "invalid vector length");
    __ sve_compare($dst$$PRegister, bt, ptrue, $src1$$FloatRegister,
                   $src2$$FloatRegister, condition);
  %}
  ins_pipe(pipe_slow);
%}
dnl
dnl VMASKCMP_SVE_IMM($1          , $2          , $3      , $4            )
dnl VMASKCMP_SVE_IMM(element_size, element_type, type_imm, type_condition)
dnl Emits "vmask$3_imm$1_sve": SVE compare of a vector against a
dnl replicated immediate of operand type $2, using the immediate form of
dnl sve_cmp. $3 selects the condition operand (immI_cmp_cond or
dnl immI_cmpU_cond); $4 is documentation only and is not expanded.
define(`VMASKCMP_SVE_IMM', `
instruct vmask$3_imm$1_sve(pReg dst, vReg src, $2 imm, immI_$3_cond cond, rFlagsReg cr) %{
  predicate(UseSVE > 0);
  match(Set dst (VectorMaskCmp (Binary src (Replicate imm)) cond));
  effect(KILL cr);
  format %{ "vmask$3_imm$1_sve $dst, $src, $imm, $cond\t# KILL cr" %}
  ins_encode %{
    Assembler::Condition condition = to_assembler_cond((BoolTest::mask)$cond$$constant);
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    assert(length_in_bytes == MaxVectorSize, "invalid vector length");
    __ sve_cmp(condition, $dst$$PRegister, get_reg_variant(in(operand_index($src))),
               ptrue, $src$$FloatRegister, (int)$imm$$constant);
  %}
  ins_pipe(pipe_slow);
%}')dnl
// Signed compares take a 5-bit immediate operand (immI5/immL5),
// unsigned compares a 7-bit one (immIU7/immLU7).
VMASKCMP_SVE_IMM(I, immI5,   cmp)
VMASKCMP_SVE_IMM(I, immIU7,  cmpU)
VMASKCMP_SVE_IMM(L, immL5,   cmp)
VMASKCMP_SVE_IMM(L, immLU7,  cmpU)

// Predicated variant of vmaskcmp_sve: the compare is governed by the
// incoming predicate "pg" instead of ptrue.
instruct vmaskcmp_masked(pReg dst, vReg src1, vReg src2, immI cond,
                         pRegGov pg, rFlagsReg cr) %{
  predicate(UseSVE > 0);
  match(Set dst (VectorMaskCmp (Binary src1 src2) (Binary cond pg)));
  effect(KILL cr);
  format %{ "vmaskcmp_masked $dst, $pg, $src1, $src2, $cond\t# KILL cr" %}
  ins_encode %{
    Assembler::Condition condition = to_assembler_cond((BoolTest::mask)$cond$$constant);
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ sve_compare($dst$$PRegister, bt, $pg$$PRegister, $src1$$FloatRegister,
                   $src2$$FloatRegister, condition);
  %}
  ins_pipe(pipe_slow);
%}

// vector mask cast

instruct vmaskcast_same_esize_neon(vReg dst_src) %{
  // No code is needed when the source and destination masks have the
  // same total size in bytes: the register contents are already correct.
  predicate(UseSVE == 0 &&
            Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1)) &&
            (Matcher::vector_length_in_bytes(n) == 8 || Matcher::vector_length_in_bytes(n) == 16));
  match(Set dst_src (VectorMaskCast dst_src));
  ins_cost(0);
  format %{ "vmaskcast_same_esize_neon $dst_src\t# do nothing" %}
  ins_encode(/* empty encoding */);
  ins_pipe(pipe_class_empty);
%}

instruct vmaskcast_extend_neon(vReg dst, vReg src) %{
  predicate(UseSVE == 0 &&
            Matcher::vector_length_in_bytes(n) > Matcher::vector_length_in_bytes(n->in(1)));
  match(Set dst (VectorMaskCast src));
  format %{ "vmaskcast_extend_neon $dst, $src" %}
  ins_encode %{
    // Cast to a mask with a larger total size, i.e. each lane is
    // widened to the destination element size. Floating point element
    // types are handled through the integral type of the same size.
    BasicType dst_bt = Matcher::vector_element_basic_type(this);
    if (is_floating_point_type(dst_bt)) {
      dst_bt = (dst_bt == T_FLOAT) ? T_INT : T_LONG;
    }
    uint length_in_bytes_dst = Matcher::vector_length_in_bytes(this);
    BasicType src_bt = Matcher::vector_element_basic_type(this, $src);
    if (is_floating_point_type(src_bt)) {
      src_bt = (src_bt == T_FLOAT) ? T_INT : T_LONG;
    }
    __ neon_vector_extend($dst$$FloatRegister, dst_bt, length_in_bytes_dst,
                          $src$$FloatRegister, src_bt);
  %}
  ins_pipe(pipe_slow);
%}

instruct vmaskcast_narrow_neon(vReg dst, vReg src) %{
  predicate(UseSVE == 0 &&
            Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)));
  match(Set dst (VectorMaskCast src));
  format %{ "vmaskcast_narrow_neon $dst, $src" %}
  ins_encode %{
    // Cast to a mask with a smaller total size, i.e. each lane is
    // narrowed to the destination element size. Floating point element
    // types are handled through the integral type of the same size.
    BasicType dst_bt = Matcher::vector_element_basic_type(this);
    if (is_floating_point_type(dst_bt)) {
      dst_bt = (dst_bt == T_FLOAT) ? T_INT : T_LONG;
    }
    BasicType src_bt = Matcher::vector_element_basic_type(this, $src);
    if (is_floating_point_type(src_bt)) {
      src_bt = (src_bt == T_FLOAT) ? T_INT : T_LONG;
    }
    uint length_in_bytes_src = Matcher::vector_length_in_bytes(this, $src);
    __ neon_vector_narrow($dst$$FloatRegister, dst_bt,
                          $src$$FloatRegister, src_bt, length_in_bytes_src);
  %}
  ins_pipe(pipe_slow);
%}

instruct vmaskcast_same_esize_sve(pReg dst_src) %{
  // No code is needed when the source and destination masks have the
  // same total size in bytes.
  predicate(UseSVE > 0 &&
            Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1)));
  match(Set dst_src (VectorMaskCast dst_src));
  ins_cost(0);
  format %{ "vmaskcast_same_esize_sve $dst_src\t# do nothing" %}
  ins_encode(/* empty encoding */);
  ins_pipe(pipe_class_empty);
%}

instruct vmaskcast_extend_sve(pReg dst, pReg src) %{
  predicate(UseSVE > 0 &&
            Matcher::vector_length_in_bytes(n) > Matcher::vector_length_in_bytes(n->in(1)));
  match(Set dst (VectorMaskCast src));
  format %{ "vmaskcast_extend_sve $dst, $src" %}
  ins_encode %{
    // The destination mask must be 2x, 4x or 8x the total size of the
    // source mask (see the assert below).
    uint length_in_bytes_dst = Matcher::vector_length_in_bytes(this);
    uint length_in_bytes_src = Matcher::vector_length_in_bytes(this, $src);
    assert(length_in_bytes_dst == 2 * length_in_bytes_src ||
           length_in_bytes_dst == 4 * length_in_bytes_src ||
           length_in_bytes_dst == 8 * length_in_bytes_src, "invalid vector length");
    __ sve_vmaskcast_extend($dst$$PRegister, $src$$PRegister,
                            length_in_bytes_dst, length_in_bytes_src);
  %}
  ins_pipe(pipe_slow);
%}

instruct vmaskcast_narrow_sve(pReg dst, pReg src, pReg ptmp) %{
  predicate(UseSVE > 0 &&
            Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)));
  match(Set dst (VectorMaskCast src));
  effect(TEMP_DEF dst, TEMP ptmp);
  format %{ "vmaskcast_narrow_sve $dst, $src\t# KILL $ptmp" %}
  ins_encode %{
    // The destination mask must be 1/2, 1/4 or 1/8 the total size of
    // the source mask (see the assert below). "ptmp" is a scratch
    // predicate used by the macro-assembler helper.
    uint length_in_bytes_dst = Matcher::vector_length_in_bytes(this);
    uint length_in_bytes_src = Matcher::vector_length_in_bytes(this, $src);
    assert(length_in_bytes_dst * 2 == length_in_bytes_src ||
           length_in_bytes_dst * 4 == length_in_bytes_src ||
           length_in_bytes_dst * 8 == length_in_bytes_src, "invalid vector length");
    __ sve_vmaskcast_narrow($dst$$PRegister, $src$$PRegister, $ptmp$$PRegister,
                            length_in_bytes_dst, length_in_bytes_src);
  %}
  ins_pipe(pipe_slow);
%}

// vector mask reinterpret

instruct vmask_reinterpret_same_esize(pReg dst_src) %{
  // Reinterpreting between masks with the same lane count and the same
  // total size requires no code.
  predicate(UseSVE > 0 &&
            Matcher::vector_length(n) == Matcher::vector_length(n->in(1)) &&
            Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1)));
  match(Set dst_src (VectorReinterpret dst_src));
  ins_cost(0);
  format %{ "vmask_reinterpret_same_esize $dst_src\t# do nothing" %}
  ins_encode(/* empty encoding */);
  ins_pipe(pipe_class_empty);
%}

instruct vmask_reinterpret_diff_esize(pReg dst, pReg src, vReg tmp, rFlagsReg cr) %{
  predicate(UseSVE > 0 &&
            Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
            Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1)));
  match(Set dst (VectorReinterpret src));
  effect(TEMP tmp, KILL cr);
  format %{ "vmask_reinterpret_diff_esize $dst, $src\t# KILL $tmp, cr" %}
  ins_encode %{
    // Same total size but different lane count/size. Expand the source
    // predicate into a vector holding -1 in active lanes at the source
    // element size, then rebuild the predicate by comparing against -1
    // at the destination element size.
    BasicType from_bt = Matcher::vector_element_basic_type(this, $src);
    Assembler::SIMD_RegVariant from_size = __ elemType_to_regVariant(from_bt);
    BasicType to_bt = Matcher::vector_element_basic_type(this);
    Assembler::SIMD_RegVariant to_size = __ elemType_to_regVariant(to_bt);
    __ sve_cpy($tmp$$FloatRegister, from_size, $src$$PRegister, -1, false);
    __ sve_cmp(Assembler::EQ, $dst$$PRegister, to_size, ptrue, $tmp$$FloatRegister, -1);
  %}
  ins_pipe(pipe_slow);
%}

// ------------------------------ Vector mask reductions -----------------------

// true count

instruct vmask_truecount_neon(iRegINoSp dst, vReg src, vReg tmp) %{
  predicate(UseSVE == 0);
  match(Set dst (VectorMaskTrueCount src));
  effect(TEMP tmp);
  format %{ "vmask_truecount_neon $dst, $src\t# KILL $tmp" %}
  ins_encode %{
    // Input "src" is a vector of boolean represented as bytes with
    // 0x00/0x01 as element values.
    BasicType bt = Matcher::vector_element_basic_type(this, $src);
    assert(bt == T_BOOLEAN, "unsupported type");
    uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src);
    // Sum all the 0/1 bytes; the sum (in byte 0 of "tmp") is the
    // number of true lanes.
    __ addv($tmp$$FloatRegister, length_in_bytes == 16 ? __ T16B : __ T8B,
            $src$$FloatRegister);
    __ umov($dst$$Register, $tmp$$FloatRegister, __ B, 0);
  %}
  ins_pipe(pipe_slow);
%}

instruct vmask_truecount_sve(iRegINoSp dst, pReg src) %{
  predicate(UseSVE > 0);
  match(Set dst (VectorMaskTrueCount src));
  format %{ "vmask_truecount_sve $dst, $src" %}
  ins_encode %{
    // Count the active (true) lanes of the source predicate directly.
    BasicType bt = Matcher::vector_element_basic_type(this, $src);
    __ sve_cntp($dst$$Register, __ elemType_to_regVariant(bt),
                ptrue, $src$$PRegister);
  %}
  ins_pipe(pipe_slow);
%}

// Combined rule for VectorMaskTrueCount (VectorStoreMask) when the vector element type is not T_BYTE.

instruct vstoremask_truecount_neon(iRegINoSp dst, vReg src, immI_gt_1 size, vReg vtmp) %{
  match(Set dst (VectorMaskTrueCount (VectorStoreMask src size)));
  effect(TEMP vtmp);
  format %{ "vstoremask_truecount_neon $dst, $src\t# KILL $vtmp" %}
  ins_encode %{
    // Input "src" is a vector mask represented as lanes with
    // 0/-1 as element values.
    uint esize = (uint)$size$$constant;
    uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src);
    Assembler::SIMD_Arrangement arrangement = Assembler::esize2arrangement(esize,
                                                                           /* isQ */ length_in_bytes == 16);
    // Sum the lanes; 2-lane arrangements are handled with a pairwise
    // add instead of the across-lanes addv.
    if (arrangement == __ T2D || arrangement == __ T2S) {
      __ addpv($vtmp$$FloatRegister, arrangement, $src$$FloatRegister, $src$$FloatRegister);
    } else {
      __ addv($vtmp$$FloatRegister, arrangement, $src$$FloatRegister);
    }
    // Each lane holds 0 or -1, so the lane sum is minus the true
    // count: sign-extend byte 0 of the sum and negate it.
    __ smov($dst$$Register, $vtmp$$FloatRegister, __ B, 0);
    __ neg($dst$$Register, $dst$$Register);
  %}
  ins_pipe(pipe_slow);
%}

// first true

instruct vmask_firsttrue_neon(iRegINoSp dst, vReg src) %{
  predicate(UseSVE == 0);
  match(Set dst (VectorMaskFirstTrue src));
  format %{ "vmask_firsttrue_neon $dst, $src" %}
  ins_encode %{
    // Returns the index of the first active lane of the
    // vector mask, or VLENGTH if no lane is active.
    //
    // Input "src" is a vector of boolean represented as
    // bytes with 0x00/0x01 as element values.
    //
    // Computed by reversing the bits and counting the leading
    // zero bytes.

    BasicType bt = Matcher::vector_element_basic_type(this, $src);
    assert(bt == T_BOOLEAN, "unsupported type");
    uint vlength = Matcher::vector_length(this, $src);
    if (vlength <= 8) {
      __ fmovd($dst$$Register, $src$$FloatRegister);
      if (vlength == 2 || vlength == 4) {
        // Special handling for 2B or 4B cases:
        // Vector mask is moved to a 64-bit general register, but only the low 16/32 bits are
        // significant for 2B/4B cases. We initialize the 16th/32nd bit as bit 1, so as to generate
        // the expected result (i.e. VLENGTH) for the case that all lanes are zero.
        __ orr($dst$$Register, $dst$$Register, vlength == 2 ? 0x10000 : 0x100000000);
      }
      __ rbit($dst$$Register, $dst$$Register);
      __ clz($dst$$Register, $dst$$Register);
      // Divide the bit index by 8 to get the byte (lane) index.
      __ lsrw($dst$$Register, $dst$$Register, 3);
    } else {
      assert(vlength == 16, "must be");
      Label FIRST_TRUE_INDEX;

      // Try to compute the result from lower 64 bits.
      __ fmovd($dst$$Register, $src$$FloatRegister);
      __ movw(rscratch1, zr);
      __ cbnz($dst$$Register, FIRST_TRUE_INDEX);

      // Compute the result from the higher 64 bits.
      // rscratch1 holds the lane offset (8) of the upper half.
      __ fmovhid($dst$$Register, $src$$FloatRegister);
      __ movw(rscratch1, 8);

      // Reverse the bits and count the leading zero bytes.
      __ bind(FIRST_TRUE_INDEX);
      __ rbit($dst$$Register, $dst$$Register);
      __ clz($dst$$Register, $dst$$Register);
      __ addw($dst$$Register, rscratch1, $dst$$Register, Assembler::LSR, 3);
    }
  %}
  ins_pipe(pipe_slow);
%}

// Return the index of the first mask lane that is set, or vector length if none of
// them are set.

instruct vmask_firsttrue_sve(iRegINoSp dst, pReg src, pReg ptmp) %{
  predicate(UseSVE > 0);
  match(Set dst (VectorMaskFirstTrue src));
  effect(TEMP ptmp);
  format %{ "vmask_firsttrue_sve $dst, $src\t# KILL $ptmp" %}
  ins_encode %{
    // "brkb" keeps only the lanes before the first active lane of
    // "src"; counting those active lanes then yields the index of the
    // first true lane, or VLENGTH when no lane is active.
    BasicType bt = Matcher::vector_element_basic_type(this, $src);
    uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src);
    assert(length_in_bytes == MaxVectorSize, "invalid vector length");
    __ sve_brkb($ptmp$$PRegister, ptrue, $src$$PRegister, false);
    __ sve_cntp($dst$$Register, __ elemType_to_regVariant(bt), ptrue, $ptmp$$PRegister);
  %}
  ins_pipe(pipe_slow);
%}

instruct vmask_firsttrue_masked(iRegINoSp dst, pReg src, pReg pg, pReg ptmp) %{
  predicate(UseSVE > 0);
  match(Set dst (VectorMaskFirstTrue src pg));
  effect(TEMP ptmp);
  format %{ "vmask_firsttrue_masked $dst, $pg, $src\t# KILL $ptmp" %}
  ins_encode %{
    // Masked variant: the break is governed by "pg", so lanes outside
    // the governing mask are ignored when locating the first true lane.
    BasicType bt = Matcher::vector_element_basic_type(this, $src);
    __ sve_brkb($ptmp$$PRegister, $pg$$PRegister, $src$$PRegister, false);
    __ sve_cntp($dst$$Register, __ elemType_to_regVariant(bt), ptrue, $ptmp$$PRegister);
  %}
  ins_pipe(pipe_slow);
%}

// last true

instruct vmask_lasttrue_neon(iRegINoSp dst, vReg src) %{
  predicate(UseSVE == 0);
  match(Set dst (VectorMaskLastTrue src));
  format %{ "vmask_lasttrue_neon $dst, $src" %}
  ins_encode %{
    // Returns the index of the last active lane of the
    // vector mask, or -1 if no lane is active.
    //
    // Input "src" is a vector of boolean represented as
    // bytes with 0x00/0x01 as element values.

    BasicType bt = Matcher::vector_element_basic_type(this, $src);
    assert(bt == T_BOOLEAN, "unsupported type");
    uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src);
    if (length_in_bytes <= 8) {
      // Computed by counting the leading zero bytes and
      // subtracting it by 7 (VLENGTH - 1).
      __ fmovd($dst$$Register, $src$$FloatRegister);
      __ clz($dst$$Register, $dst$$Register);
      // 7 == (number of bytes in the 64-bit register) - 1.
      __ movw(rscratch1, 7);
      __ subw($dst$$Register, rscratch1, $dst$$Register, Assembler::LSR, 3);
    } else {
      assert(length_in_bytes == 16, "must be");
      Label LAST_TRUE_INDEX;

      // Try to compute the result from higher 64 bits.
      __ fmovhid($dst$$Register, $src$$FloatRegister);
      __ movw(rscratch1, 16 - 1);
      __ cbnz($dst$$Register, LAST_TRUE_INDEX);

      // Compute the result from the lower 64 bits.
      __ fmovd($dst$$Register, $src$$FloatRegister);
      __ movw(rscratch1, 8 - 1);

      // Count the leading zero bytes and subtract it by 15 (VLENGTH - 1).
      __ bind(LAST_TRUE_INDEX);
      __ clz($dst$$Register, $dst$$Register);
      __ subw($dst$$Register, rscratch1, $dst$$Register, Assembler::LSR, 3);
    }
  %}
  ins_pipe(pipe_slow);
%}

instruct vmask_lasttrue_sve(iRegINoSp dst, pReg src, pReg ptmp) %{
  predicate(UseSVE > 0);
  match(Set dst (VectorMaskLastTrue src));
  effect(TEMP ptmp);
  format %{ "vmask_lasttrue_sve $dst, $src\t# KILL $ptmp" %}
  ins_encode %{
    // The index of the last active lane is computed by the
    // macro-assembler helper; "ptmp" is clobbered as scratch.
    BasicType bt = Matcher::vector_element_basic_type(this, $src);
    __ sve_vmask_lasttrue($dst$$Register, bt, $src$$PRegister, $ptmp$$PRegister);
  %}
  ins_pipe(pipe_slow);
%}

// tolong

instruct vmask_tolong_neon(iRegLNoSp dst, vReg src) %{
  predicate(UseSVE == 0);
  match(Set dst (VectorMaskToLong src));
  format %{ "vmask_tolong_neon $dst, $src" %}
  ins_encode %{
    // Input "src" is a vector of boolean represented as
    // bytes with 0x00/0x01 as element values.

    uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src);
    if (length_in_bytes <= 8) {
      __ fmovd($dst$$Register, $src$$FloatRegister);
      __ bytemask_compress($dst$$Register);
    } else {
      assert(length_in_bytes == 16, "must be");
      // "bytemask_compress" condenses the eight boolean bytes of each
      // 64-bit half into 8 bits; the two compressed halves are then
      // combined with a shift of 8.
      __ umov($dst$$Register, $src$$FloatRegister, __ D, 0);
      __ umov(rscratch1, $src$$FloatRegister, __ D, 1);
      __ bytemask_compress($dst$$Register);
      __ bytemask_compress(rscratch1);
      __ orr($dst$$Register, $dst$$Register, rscratch1, Assembler::LSL, 8);
    }
  %}
  ins_pipe(pipe_slow);
%}

instruct vmask_tolong_sve(iRegLNoSp dst, vReg src, vReg tmp) %{
  predicate(UseSVE > 0 && !VM_Version::supports_svebitperm());
  match(Set dst (VectorMaskToLong src));
  effect(TEMP tmp);
  format %{ "vmask_tolong_sve $dst, $src\t# KILL $tmp" %}
  ins_encode %{
    // Input "src" is a vector of boolean represented as
    // bytes with 0x00/0x01 as element values.
    // SVE fallback for CPUs without the SVE2 bitperm feature;
    // "tmp" is clobbered by the macro-assembler helper.
    __ sve_vmask_tolong($dst$$Register, $src$$FloatRegister,
                        $tmp$$FloatRegister, Matcher::vector_length(this, $src));
  %}
  ins_pipe(pipe_slow);
%}

instruct vmask_tolong_sve2(iRegLNoSp dst, vReg src, vReg tmp1, vReg tmp2) %{
  predicate(VM_Version::supports_svebitperm());
  match(Set dst (VectorMaskToLong src));
  effect(TEMP tmp1, TEMP tmp2);
  format %{ "vmask_tolong_sve2 $dst, $src\t# KILL $tmp1, $tmp2" %}
  ins_encode %{
    // Input "src" is a vector of boolean represented as
    // bytes with 0x00/0x01 as element values.
    // SVE2 bitperm variant; "tmp1" and "tmp2" are clobbered by the
    // macro-assembler helper.
    __ sve2_vmask_tolong($dst$$Register, $src$$FloatRegister,
                         $tmp1$$FloatRegister, $tmp2$$FloatRegister,
                         Matcher::vector_length(this, $src));
  %}
  ins_pipe(pipe_slow);
%}

// fromlong

instruct vmask_fromlong(vReg dst, iRegL src, vReg tmp) %{
  match(Set dst (VectorLongToMask src));
  effect(TEMP_DEF dst, TEMP tmp);
  format %{ "vmask_fromlong $dst, $src\t# vector (sve2). KILL $tmp" %}
  ins_encode %{
    // Expand the bits of "src" into a vector of booleans via the
    // macro-assembler helper; "tmp" is clobbered.
    // NOTE(review): the format string says sve2, but this rule has no
    // UseSVE/sve2 predicate here; matching is presumably restricted
    // elsewhere (e.g. Matcher::match_rule_supported) - confirm.
    __ sve_vmask_fromlong($dst$$FloatRegister, $src$$Register,
                          $tmp$$FloatRegister, Matcher::vector_length(this));
  %}
  ins_pipe(pipe_slow);
%}

// ------------------------------ Vector mask generation -----------------------

// maskAll
dnl
dnl VMASKALL_IMM($1,   $2      )
dnl VMASKALL_IMM(type, var_type)
define(`VMASKALL_IMM', `
instruct vmaskAll_imm$1(pReg dst, imm$1 src, rFlagsReg cr) %{
  predicate(UseSVE > 0);
  match(Set dst (MaskAll src));
  effect(KILL cr);
  format %{ "vmaskAll_imm$1 $dst, $src\t# KILL cr" %}
  ins_encode %{
    // MaskAll with a constant input: 0 produces an all-false mask,
    // -1 produces a mask with the first VLENGTH lanes set.
    $2 con = ($2)$src$$constant;
    if (con == 0) {
      __ sve_pfalse($dst$$PRegister);
    } else {
      assert(con == -1, "invalid constant value for mask");
      BasicType bt = Matcher::vector_element_basic_type(this);
      __ sve_gen_mask_imm($dst$$PRegister, bt, Matcher::vector_length(this));
    }
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl
dnl VMASKALL($1,   $2      )
dnl VMASKALL(type, arg_type)
define(`VMASKALL', `
instruct vmaskAll$1(pReg dst, $2 src, vReg tmp, rFlagsReg cr) %{
  predicate(UseSVE > 0);
  match(Set dst (MaskAll src));
  effect(TEMP tmp, KILL cr);
  format %{ "vmaskAll$1 $dst, $src\t# KILL $tmp, cr" %}
  ins_encode %{
    // Broadcast the scalar into "tmp" and compare it against zero:
    // a non-zero input yields an all-true mask, zero yields all-false.
    BasicType bt = Matcher::vector_element_basic_type(this);
    Assembler::SIMD_RegVariant size = __ elemType_to_regVariant(bt);
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    assert(length_in_bytes == MaxVectorSize, "invalid vector length");
    __ sve_dup($tmp$$FloatRegister, size, $src$$Register);
    __ sve_cmp(Assembler::NE, $dst$$PRegister, size, ptrue, $tmp$$FloatRegister, 0);
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl
dnl VMASKALL_PREDICATE($1,   $2      )
dnl VMASKALL_PREDICATE(type, arg_type)
define(`VMASKALL_PREDICATE', `
instruct vmaskAll$1_masked(pReg dst, $2 src, pRegGov pg, vReg tmp, rFlagsReg cr) %{
  predicate(UseSVE > 0);
  match(Set dst (MaskAll src pg));
  effect(TEMP tmp, KILL cr);
  format %{ "vmaskAll$1_masked $dst, $pg, $src\t# KILL $tmp, cr" %}
  ins_encode %{
    // Masked variant of maskAll: the compare against zero is governed
    // by "pg", so only lanes active in the mask are considered.
    BasicType bt = Matcher::vector_element_basic_type(this);
    Assembler::SIMD_RegVariant size = __ elemType_to_regVariant(bt);
    __ sve_dup($tmp$$FloatRegister, size, $src$$Register);
    __ sve_cmp(Assembler::NE, $dst$$PRegister, size,
               $pg$$PRegister, $tmp$$FloatRegister, 0);
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl
// Instantiate the maskAll rules for int and long inputs: constant,
// register, and predicated register variants.
VMASKALL_IMM(I, int)
VMASKALL(I, iRegIorL2I)
VMASKALL_PREDICATE(I, iRegIorL2I)
VMASKALL_IMM(L, long)
VMASKALL(L, iRegL)
VMASKALL_PREDICATE(L, iRegL)

// vector mask generation

instruct vmask_gen_I(pReg pd, iRegIorL2I src, rFlagsReg cr) %{
  predicate(UseSVE > 0);
  match(Set pd (VectorMaskGen (ConvI2L src)));
  effect(KILL cr);
  format %{ "vmask_gen_I $pd, $src\t# KILL cr" %}
  ins_encode %{
    // Generate a mask with the first "src" lanes set: while-less-than
    // over the range [0, src), using the 32-bit register form.
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ sve_whileltw($pd$$PRegister, __ elemType_to_regVariant(bt), zr, $src$$Register);
  %}
  ins_pipe(pipe_class_default);
%}

instruct vmask_gen_L(pReg pd, iRegL src, rFlagsReg cr) %{
  predicate(UseSVE > 0);
  match(Set pd (VectorMaskGen src));
  effect(KILL cr);
  format %{ "vmask_gen_L $pd, $src\t# KILL cr" %}
  ins_encode %{
    // Generate a mask with the first "src" lanes set: while-less-than
    // over the range [0, src), using the 64-bit register form.
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ sve_whilelt($pd$$PRegister, __ elemType_to_regVariant(bt), zr, $src$$Register);
  %}
  ins_pipe(pipe_slow);
%}

instruct vmask_gen_imm(pReg pd, immL con, rFlagsReg cr) %{
  predicate(UseSVE > 0);
  match(Set pd (VectorMaskGen con));
  effect(KILL cr);
  format %{ "vmask_gen_imm $pd, $con\t# KILL cr" %}
  ins_encode %{
    // Constant lane count: generate a mask with the first "con" lanes
    // set via the macro-assembler helper.
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ sve_gen_mask_imm($pd$$PRegister, bt, (uint)($con$$constant));
  %}
  ins_pipe(pipe_slow);
%}

instruct vmask_gen_sub(pReg pd, iRegL src1, iRegL src2, rFlagsReg cr) %{
  predicate(UseSVE > 0);
  match(Set pd (VectorMaskGen (SubL src1 src2)));
  effect(KILL cr);
  format %{ "vmask_gen_sub $pd, $src2, $src1\t# KILL cr" %}
  ins_encode %{
    // Mask with (src1 - src2) leading lanes set, computed directly as
    // whilelt(src2, src1) without materializing the subtraction.
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ sve_whilelt($pd$$PRegister, __ elemType_to_regVariant(bt), $src2$$Register, $src1$$Register);
  %}
  ins_pipe(pipe_slow);
%}

// ------------------------------ Popcount vector ------------------------------

// vector popcount - INT

instruct vpopcountI(vReg dst, vReg src) %{
  match(Set dst (PopCountVI src));
  format %{ "vpopcountI $dst, $src" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    if (bt == T_BYTE) {
      // Byte lanes map directly to a per-byte bit count.
      if (VM_Version::use_neon_for_vector(length_in_bytes)) {
        __ cnt($dst$$FloatRegister, length_in_bytes == 16 ? __ T16B : __ T8B,
               $src$$FloatRegister);
      } else {
        assert(UseSVE > 0, "must be sve");
        __ sve_cnt($dst$$FloatRegister, __ B, ptrue, $src$$FloatRegister);
      }
    } else {
      assert(bt == T_SHORT || bt == T_INT, "unsupported");
      if (UseSVE == 0) {
        // NEON: count bits per byte, then pairwise-widen the partial
        // sums up to the element size (bytes -> halfwords -> words).
        assert(length_in_bytes <= 16, "unsupported");
        bool isQ = length_in_bytes == 16;
        __ cnt($dst$$FloatRegister, isQ ? __ T16B : __ T8B, $src$$FloatRegister);
        __ uaddlp($dst$$FloatRegister, isQ ? __ T16B : __ T8B, $dst$$FloatRegister);
        if (bt == T_INT) {
          __ uaddlp($dst$$FloatRegister, isQ ? __ T8H : __ T4H, $dst$$FloatRegister);
        }
      } else {
        __ sve_cnt($dst$$FloatRegister, __ elemType_to_regVariant(bt),
                   ptrue, $src$$FloatRegister);
      }
    }
  %}
  ins_pipe(pipe_slow);
%}

// vector popcount - LONG

instruct vpopcountL(vReg dst, vReg src) %{
  match(Set dst (PopCountVL src));
  format %{ "vpopcountL $dst, $src" %}
  ins_encode %{
    if (UseSVE == 0) {
      // NEON: count bits per byte, then pairwise-widen the partial
      // sums 8 -> 16 -> 32 -> 64 bits.
      __ cnt($dst$$FloatRegister, __ T16B, $src$$FloatRegister);
      __ uaddlp($dst$$FloatRegister, __ T16B, $dst$$FloatRegister);
      __ uaddlp($dst$$FloatRegister, __ T8H, $dst$$FloatRegister);
      __ uaddlp($dst$$FloatRegister, __ T4S, $dst$$FloatRegister);
    } else {
      __ sve_cnt($dst$$FloatRegister, __ D, ptrue, $src$$FloatRegister);
    }
  %}
  ins_pipe(pipe_slow);
%}

// vector popcount - predicated
// PopCountVI under a vector mask maps to the predicated sve_cnt form.
UNARY_OP_PREDICATE(vpopcountI, PopCountVI, sve_cnt)

instruct vpopcountL_masked(vReg dst_src, pRegGov pg) %{
  predicate(UseSVE > 0);
  match(Set dst_src (PopCountVL dst_src pg));
  format %{ "vpopcountL_masked $dst_src, $pg, $dst_src" %}
  ins_encode %{
    // Predicated popcount on 64-bit lanes, governed by "pg";
    // "dst_src" is both the destination and the source.
    __ sve_cnt($dst_src$$FloatRegister, __ D,
               $pg$$PRegister, $dst_src$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

// ------------------------------ Vector blend ---------------------------------

instruct vblend_neon(vReg dst, vReg src1, vReg src2) %{
  predicate(UseSVE == 0);
  match(Set dst (VectorBlend (Binary src1 src2) dst));
  format %{ "vblend_neon $dst, $src1, $src2" %}
  ins_encode %{
    // Bit select: "dst" holds the blend mask on entry; bits that are
    // set pick from "src2", clear bits pick from "src1".
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    assert(length_in_bytes <= 16, "must be");
    __ bsl($dst$$FloatRegister, length_in_bytes == 16 ? __ T16B : __ T8B,
           $src2$$FloatRegister, $src1$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

instruct vblend_sve(vReg dst, vReg src1, vReg src2, pReg pg) %{
  predicate(UseSVE > 0);
  match(Set dst (VectorBlend (Binary src1 src2) pg));
  format %{ "vblend_sve $dst, $pg, $src1, $src2" %}
  ins_encode %{
    // Select "src2" for lanes active in "pg" and "src1" for the rest.
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ sve_sel($dst$$FloatRegister, __ elemType_to_regVariant(bt),
               $pg$$PRegister, $src2$$FloatRegister, $src1$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

// ------------------------------ Vector round ---------------------------------

// vector Math.round

instruct vround_le128b(vReg dst, vReg src, vReg tmp1, vReg tmp2,
                       vReg tmp3, rFlagsReg cr) %{
  predicate(Matcher::vector_length_in_bytes(n) <= 16);
  match(Set dst (RoundVF src));
  match(Set dst (RoundVD src));
  effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, KILL cr);
  format %{ "vround_le128b $dst, $src\t# vector <= 128 bits. KILL $tmp1, $tmp2, $tmp3, cr" %}
  ins_encode %{
    // Vector Math.round for float/double, delegated to the NEON
    // macro-assembler helper; three vector temporaries are clobbered.
    __ vector_round_neon($dst$$FloatRegister, $src$$FloatRegister,
                         $tmp1$$FloatRegister, $tmp2$$FloatRegister,
                         $tmp3$$FloatRegister, get_arrangement(this));
  %}
  ins_pipe(pipe_slow);
%}

instruct vround_gt128b(vReg dst, vReg src, vReg tmp1, vReg tmp2,
                       pRegGov pgtmp, rFlagsReg cr) %{
  predicate(Matcher::vector_length_in_bytes(n) > 16);
  match(Set dst (RoundVF src));
  match(Set dst (RoundVD src));
  effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2, TEMP pgtmp, KILL cr);
  format %{ "vround_gt128b $dst, $src\t# vector > 128 bits. KILL $tmp1, $tmp2, $pgtmp, cr" %}
  ins_encode %{
    // Vector Math.round for float/double on vectors wider than 128
    // bits, delegated to the SVE macro-assembler helper. Two vector
    // temporaries and a predicate temporary are clobbered.
    assert(UseSVE > 0, "must be sve");
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ vector_round_sve($dst$$FloatRegister, $src$$FloatRegister,
                        $tmp1$$FloatRegister, $tmp2$$FloatRegister,
                        $pgtmp$$PRegister, __ elemType_to_regVariant(bt));
  %}
  ins_pipe(pipe_slow);
%}

// ------------------------------ RoundDouble ----------------------------------

// vector Math.rint, floor, ceil

instruct vroundD(vReg dst, vReg src, immI rmode) %{
  predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
  match(Set dst (RoundDoubleModeV src rmode));
  format %{ "vroundD $dst, $src, $rmode" %}
  ins_encode %{
    // rmode_rint/rmode_floor/rmode_ceil map to frintn (to nearest,
    // ties to even), frintm (toward minus infinity) and frintp
    // (toward plus infinity) respectively.
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    if (VM_Version::use_neon_for_vector(length_in_bytes)) {
      switch ($rmode$$constant) {
        case RoundDoubleModeNode::rmode_rint:
          __ frintn($dst$$FloatRegister, __ T2D, $src$$FloatRegister);
          break;
        case RoundDoubleModeNode::rmode_floor:
          __ frintm($dst$$FloatRegister, __ T2D, $src$$FloatRegister);
          break;
        case RoundDoubleModeNode::rmode_ceil:
          __ frintp($dst$$FloatRegister, __ T2D, $src$$FloatRegister);
          break;
        default:
          assert(false, "unsupported");
          ShouldNotReachHere();
      }
    } else {
      assert(UseSVE > 0, "must be sve");
      switch ($rmode$$constant) {
        case RoundDoubleModeNode::rmode_rint:
          __ sve_frintn($dst$$FloatRegister, __ D, ptrue, $src$$FloatRegister);
          break;
        case RoundDoubleModeNode::rmode_floor:
          __ sve_frintm($dst$$FloatRegister, __ D, ptrue, $src$$FloatRegister);
          break;
        case RoundDoubleModeNode::rmode_ceil:
          __ sve_frintp($dst$$FloatRegister, __ D, ptrue, $src$$FloatRegister);
          break;
        default:
          assert(false, "unsupported");
          ShouldNotReachHere();
      }
    }
  %}
  ins_pipe(pipe_slow);
%}

// ------------------------------ VectorTest -----------------------------------

// anytrue

instruct vtest_anytrue_neon(rFlagsReg cr, vReg src1, vReg src2, vReg tmp) %{
  predicate(UseSVE == 0 &&
            static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
  match(Set cr (VectorTest src1 src2));
  effect(TEMP tmp);
  format %{ "vtest_anytrue_neon $src1\t# KILL $tmp" %}
  ins_encode %{
    // No need to use src2.
    // Sum all the mask bytes and compare the sum against zero; the
    // resulting NE flag state means at least one lane is true.
    uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src1);
    assert(length_in_bytes == 8 || length_in_bytes == 16, "must be");
    __ addv($tmp$$FloatRegister, length_in_bytes == 16 ? __ T16B : __ T8B, $src1$$FloatRegister);
    __ umov(rscratch1, $tmp$$FloatRegister, __ B, 0);
    __ cmpw(rscratch1, zr);
  %}
  ins_pipe(pipe_slow);
%}

instruct vtest_anytrue_sve(rFlagsReg cr, pReg src1, pReg src2) %{
  predicate(UseSVE > 0 &&
            static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
  match(Set cr (VectorTest src1 src2));
  format %{ "vtest_anytrue_sve $src1" %}
  ins_encode %{
    // "src2" is not used for sve.
    // ptest sets the condition flags according to the active lanes of
    // the predicate register.
    __ sve_ptest(ptrue, $src1$$PRegister);
  %}
  ins_pipe(pipe_slow);
%}

// alltrue

instruct vtest_alltrue_neon(rFlagsReg cr, vReg src1, vReg src2, vReg tmp) %{
  predicate(UseSVE == 0 &&
            static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
  match(Set cr (VectorTest src1 src2));
  effect(TEMP tmp);
  format %{ "vtest_alltrue_neon $src1\t# KILL $tmp" %}
  ins_encode %{
    // No need to use src2.
    // The unsigned minimum across all mask bytes equals 0xff only when
    // every byte is 0xff, i.e. when all lanes are true.
    uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src1);
    assert(length_in_bytes == 8 || length_in_bytes == 16, "must be");
    __ uminv($tmp$$FloatRegister, length_in_bytes == 16 ? __ T16B : __ T8B, $src1$$FloatRegister);
    __ umov(rscratch1, $tmp$$FloatRegister, __ B, 0);
    __ cmpw(rscratch1, 0xff);
  %}
  ins_pipe(pipe_slow);
%}

instruct vtest_alltrue_sve(rFlagsReg cr, pReg src1, pReg src2, pReg ptmp) %{
  predicate(UseSVE > 0 &&
            static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
  match(Set cr (VectorTest src1 src2));
  effect(TEMP ptmp);
  format %{ "vtest_alltrue_sve $src1, $src2\t# KILL $ptmp" %}
  ins_encode %{
    // Flag-setting exclusive-or of the two predicates (under ptrue):
    // the condition flags reflect whether src1 and src2 are equal.
    __ sve_eors($ptmp$$PRegister, ptrue, $src1$$PRegister, $src2$$PRegister);
  %}
  ins_pipe(pipe_slow);
%}

// ------------------------------ Vector rearrange -----------------------------

instruct rearrange_HSD_neon(vReg dst, vReg src, vReg shuffle, vReg tmp) %{
  predicate(UseSVE == 0 && Matcher::vector_element_basic_type(n) != T_BYTE);
  match(Set dst (VectorRearrange src shuffle));
  effect(TEMP_DEF dst, TEMP tmp);
  format %{ "rearrange_HSD_neon $dst, $src, $shuffle\t# vector (4H/8H/2S/4S/2D). KILL $tmp" %}
  ins_encode %{
    // Rearrangement for non-byte lane types on NEON is delegated to the
    // macro-assembler helper; "tmp" is clobbered.
    BasicType bt = Matcher::vector_element_basic_type(this);
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    assert(length_in_bytes == 8 || length_in_bytes == 16, "must be");
    __ neon_rearrange_hsd($dst$$FloatRegister, $src$$FloatRegister,
                          $shuffle$$FloatRegister, $tmp$$FloatRegister,
                          bt, length_in_bytes == 16);
  %}
  ins_pipe(pipe_slow);
%}

instruct rearrange(vReg dst, vReg src, vReg shuffle) %{
  predicate(UseSVE > 0 || Matcher::vector_element_basic_type(n) == T_BYTE);
  match(Set dst (VectorRearrange src shuffle));
  format %{ "rearrange $dst, $src, $shuffle" %}
  ins_encode %{
    // T_BYTE vectors that fit in NEON use the tbl table lookup
    // directly; all other cases use the SVE tbl instruction at the
    // source element size.
    BasicType bt_dst = Matcher::vector_element_basic_type(this);
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    if (bt_dst == T_BYTE && VM_Version::use_neon_for_vector(length_in_bytes)) {
      __ tbl($dst$$FloatRegister, length_in_bytes == 16 ? __ T16B : __ T8B,
             $src$$FloatRegister, 1, $shuffle$$FloatRegister);
    } else {
      assert(UseSVE > 0, "must be sve");
      BasicType bt_src = Matcher::vector_element_basic_type(this, $src);
      __ sve_tbl($dst$$FloatRegister, __ elemType_to_regVariant(bt_src),
                 $src$$FloatRegister, $shuffle$$FloatRegister);
    }
  %}
  ins_pipe(pipe_slow);
%}

// ------------------------------ Vector Load Gather ---------------------------

instruct gather_loadS(vReg dst, indirect mem, vReg idx) %{
  predicate(UseSVE > 0 &&
            type2aelembytes(Matcher::vector_element_basic_type(n)) == 4);
  match(Set dst (LoadVectorGather mem idx));
  format %{ "gather_loadS $dst, $mem, $idx\t# vector (sve)" %}
  ins_encode %{
    // Gather load of 32-bit elements, addressed by the base register
    // of "mem" plus the vector of indices in "idx".
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    assert(length_in_bytes == MaxVectorSize, "invalid vector length");
    __ sve_ld1w_gather($dst$$FloatRegister, ptrue,
                       as_Register($mem$$base), $idx$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

instruct gather_loadD(vReg dst, indirect mem, vReg idx, vReg tmp) %{
  predicate(UseSVE > 0 &&
            type2aelembytes(Matcher::vector_element_basic_type(n)) == 8);
  match(Set dst (LoadVectorGather mem idx));
  effect(TEMP tmp);
  format %{ "gather_loadD $dst, $mem, $idx\t# vector (sve). KILL $tmp" %}
  ins_encode %{
    // Gather load of 64-bit elements. The indices in the low half of
    // "idx" are first unsigned-unpacked (zero-extended) to 64-bit
    // lanes in "tmp", which is clobbered.
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    assert(length_in_bytes == MaxVectorSize, "invalid vector length");
    __ sve_uunpklo($tmp$$FloatRegister, __ D, $idx$$FloatRegister);
    __ sve_ld1d_gather($dst$$FloatRegister, ptrue, as_Register($mem$$base),
                       $tmp$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

instruct gather_loadS_masked(vReg dst, indirect mem, vReg idx, pRegGov pg) %{
  predicate(UseSVE > 0 &&
            type2aelembytes(Matcher::vector_element_basic_type(n)) == 4);
  match(Set dst (LoadVectorGatherMasked mem (Binary idx pg)));
  format %{ "gather_loadS_masked $dst, $pg, $mem, $idx" %}
  ins_encode %{
    // Masked gather load of 32-bit elements: the load is governed by
    // mask "pg" instead of ptrue.
    __ sve_ld1w_gather($dst$$FloatRegister, $pg$$PRegister,
                       as_Register($mem$$base), $idx$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

instruct gather_loadD_masked(vReg dst, indirect mem, vReg idx, pRegGov pg, vReg tmp) %{
  predicate(UseSVE > 0 &&
            type2aelembytes(Matcher::vector_element_basic_type(n)) == 8);
  match(Set dst (LoadVectorGatherMasked mem (Binary idx pg)));
  effect(TEMP tmp);
  format %{ "gather_loadD_masked $dst, $pg, $mem, $idx\t# KILL $tmp" %}
  ins_encode %{
    // Masked gather load of 64-bit elements: the indices are first
    // unsigned-unpacked to 64-bit lanes in "tmp" (clobbered), and the
    // load is governed by mask "pg".
    __ sve_uunpklo($tmp$$FloatRegister, __ D, $idx$$FloatRegister);
    __ sve_ld1d_gather($dst$$FloatRegister, $pg$$PRegister,
                       as_Register($mem$$base), $tmp$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

// ------------------------------ Vector Store Scatter -------------------------

// Scatter-store of 32-bit (int/float) elements. In the predicate,
// n->in(3)->in(1) navigates the StoreVectorScatter node to the stored
// vector (src), whose element size selects this rule.
instruct scatter_storeS(indirect mem, vReg src, vReg idx) %{
  predicate(UseSVE > 0 &&
            type2aelembytes(Matcher::vector_element_basic_type(n->in(3)->in(1))) == 4);
  match(Set mem (StoreVectorScatter mem (Binary src idx)));
  format %{ "scatter_storeS $mem, $idx, $src\t# vector (sve)" %}
  ins_encode %{
    // Unmasked scatter is only generated for full-size vectors.
    uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src);
    assert(length_in_bytes == MaxVectorSize, "invalid vector length");
    __ sve_st1w_scatter($src$$FloatRegister, ptrue,
                        as_Register($mem$$base), $idx$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

// Scatter-store of 64-bit (long/double) elements. The 32-bit indices are
// zero-extended to 64-bit lanes in $tmp before being used by ST1D.
instruct scatter_storeD(indirect mem, vReg src, vReg idx, vReg tmp) %{
  predicate(UseSVE > 0 &&
            type2aelembytes(Matcher::vector_element_basic_type(n->in(3)->in(1))) == 8);
  match(Set mem (StoreVectorScatter mem (Binary src idx)));
  effect(TEMP tmp);
  format %{ "scatter_storeD $mem, $idx, $src\t# vector (sve). KILL $tmp" %}
  ins_encode %{
    // Unmasked scatter is only generated for full-size vectors.
    uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src);
    assert(length_in_bytes == MaxVectorSize, "invalid vector length");
    // Zero-extend the low half of the 32-bit index vector to 64-bit lanes.
    __ sve_uunpklo($tmp$$FloatRegister, __ D, $idx$$FloatRegister);
    __ sve_st1d_scatter($src$$FloatRegister, ptrue,
                        as_Register($mem$$base), $tmp$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

// Predicated scatter-store of 32-bit elements: only lanes active in $pg
// are written to memory.
instruct scatter_storeS_masked(indirect mem, vReg src, vReg idx, pRegGov pg) %{
  predicate(UseSVE > 0 &&
            type2aelembytes(Matcher::vector_element_basic_type(n->in(3)->in(1))) == 4);
  match(Set mem (StoreVectorScatterMasked mem (Binary src (Binary idx pg))));
  format %{ "scatter_storeS_masked $mem, $pg, $idx, $src" %}
  ins_encode %{
    __ sve_st1w_scatter($src$$FloatRegister, $pg$$PRegister,
                        as_Register($mem$$base), $idx$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

// Predicated scatter-store of 64-bit elements. The 32-bit indices are
// zero-extended to 64-bit lanes in $tmp; only lanes active in $pg are
// written to memory.
instruct scatter_storeD_masked(indirect mem, vReg src, vReg idx, pRegGov pg, vReg tmp) %{
  predicate(UseSVE > 0 &&
            type2aelembytes(Matcher::vector_element_basic_type(n->in(3)->in(1))) == 8);
  match(Set mem (StoreVectorScatterMasked mem (Binary src (Binary idx pg))));
  effect(TEMP tmp);
  format %{ "scatter_storeD_masked $mem, $pg, $idx, $src\t# KILL $tmp" %}
  ins_encode %{
    // Zero-extend the low half of the 32-bit index vector to 64-bit lanes.
    __ sve_uunpklo($tmp$$FloatRegister, __ D, $idx$$FloatRegister);
    __ sve_st1d_scatter($src$$FloatRegister, $pg$$PRegister,
                        as_Register($mem$$base), $tmp$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

// ------------------------------ CountLeadingZerosV ---------------------------

// Count leading zero bits per element. Selects between NEON and SVE based
// on the vector length and element type.
instruct vcountLeadingZeros(vReg dst, vReg src) %{
  match(Set dst (CountLeadingZerosV src));
  format %{ "vcountLeadingZeros $dst, $src" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    if (bt == T_LONG) {
      if (UseSVE == 0) {
        // NEON CLZ has no 64-bit lane variant: move each D lane through a
        // general-purpose register and use the scalar CLZ instead.
        __ umov(rscratch1, $src$$FloatRegister, __ D, 0);
        __ clz(rscratch1, rscratch1);
        __ mov($dst$$FloatRegister, __ D, 0, rscratch1);
        __ umov(rscratch1, $src$$FloatRegister, __ D, 1);
        __ clz(rscratch1, rscratch1);
        __ mov($dst$$FloatRegister, __ D, 1, rscratch1);
      } else {
        __ sve_clz($dst$$FloatRegister, __ D, ptrue, $src$$FloatRegister);
      }
    } else {
      // Sub-64-bit element types: a single vector CLZ on either unit.
      if (VM_Version::use_neon_for_vector(length_in_bytes)) {
        __ clz($dst$$FloatRegister, get_arrangement(this), $src$$FloatRegister);
      } else {
        assert(UseSVE > 0, "must be sve");
        __ sve_clz($dst$$FloatRegister, __ elemType_to_regVariant(bt),
                   ptrue, $src$$FloatRegister);
      }
    }
  %}
  ins_pipe(pipe_slow);
%}

// The dst and src should use the same register to make sure the
// inactive lanes in dst save the same elements as src.
// m4 macro invocation: expands to the predicated (masked) SVE variant of
// CountLeadingZerosV, implemented with a single governed sve_clz.
UNARY_OP_PREDICATE(vcountLeadingZeros, CountLeadingZerosV, sve_clz)

// ------------------------------ CountTrailingZerosV --------------------------

// Count trailing zero bits per element, computed as ctz(x) = clz(rbit(x)):
// reverse the bits of each element, then count leading zeros.
instruct vcountTrailingZeros(vReg dst, vReg src) %{
  match(Set dst (CountTrailingZerosV src));
  format %{ "vcountTrailingZeros $dst, $src" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    if (bt == T_BYTE) {
      // NEON RBIT only has a byte arrangement, so T_BYTE needs no helper.
      if (VM_Version::use_neon_for_vector(length_in_bytes)) {
        __ rbit($dst$$FloatRegister, length_in_bytes == 16 ? __ T16B : __ T8B,
                $src$$FloatRegister);
        __ clz($dst$$FloatRegister, get_arrangement(this), $dst$$FloatRegister);
      } else {
        assert(UseSVE > 0, "must be sve");
        __ sve_rbit($dst$$FloatRegister, __ B, ptrue, $src$$FloatRegister);
        __ sve_clz($dst$$FloatRegister, __ B, ptrue, $dst$$FloatRegister);
      }
    } else {
      assert(bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported type");
      if (UseSVE == 0) {
        assert(length_in_bytes <= 16, "unsupported");
        // Helper composes byte-wise RBIT + REV to reverse bits per element.
        __ neon_reverse_bits($dst$$FloatRegister, $src$$FloatRegister,
                             bt, /* isQ */ length_in_bytes == 16);
        if (bt != T_LONG) {
          __ clz($dst$$FloatRegister, get_arrangement(this), $dst$$FloatRegister);
        } else {
          // NEON CLZ has no 64-bit lane variant: count each D lane via a
          // general-purpose register and the scalar CLZ.
          __ umov(rscratch1, $dst$$FloatRegister, __ D, 0);
          __ clz(rscratch1, rscratch1);
          __ mov($dst$$FloatRegister, __ D, 0, rscratch1);
          __ umov(rscratch1, $dst$$FloatRegister, __ D, 1);
          __ clz(rscratch1, rscratch1);
          __ mov($dst$$FloatRegister, __ D, 1, rscratch1);
        }
      } else {
        Assembler::SIMD_RegVariant size = __ elemType_to_regVariant(bt);
        __ sve_rbit($dst$$FloatRegister, size, ptrue, $src$$FloatRegister);
        __ sve_clz($dst$$FloatRegister, size, ptrue, $dst$$FloatRegister);
      }
    }
  %}
  ins_pipe(pipe_slow);
%}

// The dst and src should use the same register to make sure the
// inactive lanes in dst save the same elements as src.
// Masked variant: ctz(x) = clz(rbit(x)) applied only to lanes active in
// $pg; dst_src is both input and output so inactive lanes are preserved.
instruct vcountTrailingZeros_masked(vReg dst_src, pRegGov pg) %{
  predicate(UseSVE > 0);
  match(Set dst_src (CountTrailingZerosV dst_src pg));
  format %{ "vcountTrailingZeros_masked $dst_src, $pg, $dst_src" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    Assembler::SIMD_RegVariant size = __ elemType_to_regVariant(bt);
    __ sve_rbit($dst_src$$FloatRegister, size,
                $pg$$PRegister, $dst_src$$FloatRegister);
    __ sve_clz($dst_src$$FloatRegister, size,
               $pg$$PRegister, $dst_src$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

// ------------------------------ ReverseV -------------------------------------

// Reverse the bit order within each element (Long/Integer.reverse etc.).
instruct vreverse(vReg dst, vReg src) %{
  match(Set dst (ReverseV src));
  format %{ "vreverse $dst, $src" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    if (bt == T_BYTE) {
      // NEON RBIT only has a byte arrangement, so T_BYTE is a single RBIT.
      if (VM_Version::use_neon_for_vector(length_in_bytes)) {
        __ rbit($dst$$FloatRegister, length_in_bytes == 16 ? __ T16B : __ T8B,
                $src$$FloatRegister);
      } else {
        assert(UseSVE > 0, "must be sve");
        __ sve_rbit($dst$$FloatRegister, __ B, ptrue, $src$$FloatRegister);
      }
    } else {
      assert(bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported type");
      if (UseSVE == 0) {
        assert(length_in_bytes <= 16, "unsupported");
        // Helper composes byte-wise RBIT + REV to reverse bits per element.
        __ neon_reverse_bits($dst$$FloatRegister, $src$$FloatRegister,
                             bt, /* isQ */ length_in_bytes == 16);
      } else {
        __ sve_rbit($dst$$FloatRegister, __ elemType_to_regVariant(bt),
                    ptrue, $src$$FloatRegister);
      }
    }
  %}
  ins_pipe(pipe_slow);
%}

// The dst and src should use the same register to make sure the
// inactive lanes in dst save the same elements as src.
// m4 macro invocation: expands to the predicated (masked) SVE variant of
// ReverseV, implemented with a single governed sve_rbit.
UNARY_OP_PREDICATE(vreverse, ReverseV, sve_rbit)

// ------------------------------ ReverseBytesV --------------------------------

// Reverse the byte order within each element (Long/Integer.reverseBytes
// etc.). For T_BYTE the operation is the identity, so it degenerates to a
// register copy (elided when dst == src).
instruct vreverseBytes(vReg dst, vReg src) %{
  match(Set dst (ReverseBytesV src));
  format %{ "vreverseBytes $dst, $src" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    if (VM_Version::use_neon_for_vector(length_in_bytes)) {
      assert(length_in_bytes <= 16, "unsupported");
      if (bt == T_BYTE) {
        if ($dst$$FloatRegister != $src$$FloatRegister) {
          // ORR with itself is the canonical NEON register move.
          __ orr($dst$$FloatRegister, length_in_bytes == 16 ? __ T16B : __ T8B,
                 $src$$FloatRegister, $src$$FloatRegister);
        }
      } else {
        __ neon_reverse_bytes($dst$$FloatRegister, $src$$FloatRegister,
                              bt, /* isQ */ length_in_bytes == 16);
      }
    } else {
      assert(UseSVE > 0, "must be sve");
      if (bt == T_BYTE) {
        if ($dst$$FloatRegister != $src$$FloatRegister) {
          // ORR with itself is the canonical SVE register move.
          __ sve_orr($dst$$FloatRegister, $src$$FloatRegister, $src$$FloatRegister);
        }
      } else {
        __ sve_revb($dst$$FloatRegister, __ elemType_to_regVariant(bt),
                    ptrue, $src$$FloatRegister);
      }
    }
  %}
  ins_pipe(pipe_slow);
%}

// The dst and src should use the same register to make sure the
// inactive lanes in dst save the same elements as src.
// Masked byte-reversal: REVB only on lanes active in $pg. For T_BYTE the
// operation is the identity, so no instruction is emitted at all.
instruct vreverseBytes_masked(vReg dst_src, pRegGov pg) %{
  predicate(UseSVE > 0);
  match(Set dst_src (ReverseBytesV dst_src pg));
  format %{ "vreverseBytes_masked $dst_src, $pg, $dst_src" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    if (bt == T_BYTE) {
      // do nothing
    } else {
      __ sve_revb($dst_src$$FloatRegister, __ elemType_to_regVariant(bt),
                  $pg$$PRegister, $dst_src$$FloatRegister);
    }
  %}
  ins_pipe(pipe_slow);
%}

// ------------------------------ Populate Index to a Vector -------------------

// Populate an index vector: SVE INDEX generates dst[i] = src1 + i * src2,
// with the starting value in a register and the constant step immediate.
instruct populateindex(vReg dst, iRegIorL2I src1, immI src2) %{
  predicate(UseSVE > 0);
  match(Set dst (PopulateIndex src1 src2));
  format %{ "populateindex $dst, $src1, $src2\t # populate index (sve)" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ sve_index($dst$$FloatRegister, __ elemType_to_regVariant(bt),
                 $src1$$Register, (int)($src2$$constant));
  %}
  ins_pipe(pipe_slow);
%}

// ------------------------------ Compress/Expand Operations -------------------

// Compress a predicate mask: produce a mask whose first N lanes are active,
// where N is the number of active lanes in $pg. Implemented as CNTP (count
// active lanes into rscratch1) followed by WHILELT(zr, count), which sets
// lane i iff i < count. KILLs flags because WHILELT sets the condition codes.
instruct mcompress(pReg dst, pReg pg, rFlagsReg cr) %{
  predicate(UseSVE > 0);
  match(Set dst (CompressM pg));
  effect(KILL cr);
  format %{ "mcompress $dst, $pg\t# KILL cr" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    Assembler::SIMD_RegVariant size = __ elemType_to_regVariant(bt);
    __ sve_cntp(rscratch1, size, ptrue, $pg$$PRegister);
    __ sve_whilelt(as_PRegister($dst$$reg), size, zr, rscratch1);
  %}
  ins_pipe(pipe_slow);
%}

// Compress active elements to the front of the vector. SVE COMPACT only
// supports word and doubleword elements, hence the !is_subword_type
// predicate; byte/short use the dedicated rules below.
instruct vcompress(vReg dst, vReg src, pRegGov pg) %{
  predicate(UseSVE > 0 &&
            !is_subword_type(Matcher::vector_element_basic_type(n)));
  match(Set dst (CompressV src pg));
  format %{ "vcompress $dst, $src, $pg" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ sve_compact($dst$$FloatRegister, __ elemType_to_regVariant(bt),
                   $src$$FloatRegister, $pg$$PRegister);
  %}
  ins_pipe(pipe_slow);
%}

// Compress for byte elements, which COMPACT cannot handle directly; a
// macro-assembler helper synthesizes it using several vector and predicate
// temporaries. TEMP_DEF dst: dst is written before all inputs are consumed.
instruct vcompressB(vReg dst, vReg src, pReg pg, vReg tmp1, vReg tmp2,
                    vReg tmp3, pReg ptmp, pRegGov pgtmp) %{
  predicate(UseSVE > 0 && Matcher::vector_element_basic_type(n) == T_BYTE);
  effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP ptmp, TEMP pgtmp);
  match(Set dst (CompressV src pg));
  format %{ "vcompressB $dst, $src, $pg\t# KILL $tmp1, $tmp2, $tmp3, $ptmp, $pgtmp" %}
  ins_encode %{
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    __ sve_compress_byte($dst$$FloatRegister, $src$$FloatRegister, $pg$$PRegister,
                         $tmp1$$FloatRegister, $tmp2$$FloatRegister, $tmp3$$FloatRegister,
                         $ptmp$$PRegister, $pgtmp$$PRegister, length_in_bytes);
  %}
  ins_pipe(pipe_slow);
%}

// Compress for short elements (also unsupported by COMPACT). $tmp1 is
// zeroed up front because the sve_compress_short helper expects it cleared.
instruct vcompressS(vReg dst, vReg src, pReg pg, vReg tmp1, vReg tmp2, pRegGov pgtmp) %{
  predicate(UseSVE > 0 && Matcher::vector_element_basic_type(n) == T_SHORT);
  effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2, TEMP pgtmp);
  match(Set dst (CompressV src pg));
  format %{ "vcompressS $dst, $src, $pg\t# KILL $tmp1, $tmp2, $pgtmp" %}
  ins_encode %{
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    __ sve_dup($tmp1$$FloatRegister, __ H, 0);
    __ sve_compress_short($dst$$FloatRegister, $src$$FloatRegister, $pg$$PRegister,
                          $tmp1$$FloatRegister, $tmp2$$FloatRegister, $pgtmp$$PRegister,
                          length_in_bytes);
  %}
  ins_pipe(pipe_slow);
%}

// ExpandV on NEON-only hardware: the mask arrives as a vector (not a
// predicate register); work is delegated to a macro-assembler helper.
instruct vexpand_neon(vReg dst, vReg src, vReg mask, vReg tmp1, vReg tmp2) %{
  predicate(UseSVE == 0);
  match(Set dst (ExpandV src mask));
  effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2);
  format %{ "vexpand_neon $dst, $src, $mask\t# KILL $tmp1, $tmp2" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    int length_in_bytes = (int) Matcher::vector_length_in_bytes(this);
    __ vector_expand_neon($dst$$FloatRegister, $src$$FloatRegister, $mask$$FloatRegister,
                          $tmp1$$FloatRegister, $tmp2$$FloatRegister, bt, length_in_bytes);
  %}
  ins_pipe(pipe_slow);
%}

// ExpandV generic SVE path. Used on SVE1, and on SVE2 only for sub-word
// element types; SVE2 word/doubleword elements take the cheaper
// HISTCNT+TBL rule below (vexpand_sve2_SD).
instruct vexpand_sve(vReg dst, vReg src, pRegGov pg, vReg tmp1, vReg tmp2) %{
  predicate(UseSVE == 1 || (UseSVE == 2 && type2aelembytes(Matcher::vector_element_basic_type(n)) < 4));
  match(Set dst (ExpandV src pg));
  effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2);
  format %{ "vexpand_sve $dst, $src, $pg\t# KILL $tmp1, $tmp2" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    int length_in_bytes = (int) Matcher::vector_length_in_bytes(this);
    __ vector_expand_sve($dst$$FloatRegister, $src$$FloatRegister, $pg$$PRegister,
                         $tmp1$$FloatRegister, $tmp2$$FloatRegister, bt, length_in_bytes);
  %}
  ins_pipe(pipe_slow);
%}

// ExpandV for word/doubleword elements on SVE2 (HISTCNT is an SVE2
// instruction, hence the UseSVE == 2 predicate).
instruct vexpand_sve2_SD(vReg dst, vReg src, pRegGov pg) %{
  predicate(UseSVE == 2 && type2aelembytes(Matcher::vector_element_basic_type(n)) >= 4);
  match(Set dst (ExpandV src pg));
  effect(TEMP_DEF dst);
  format %{ "vexpand_sve2_SD $dst, $src, $pg" %}
  ins_encode %{
    // Example input:   src   = 1 2 3 4 5 6 7 8
    //                  pg    = 1 0 0 1 1 0 1 1
    // Expected result: dst   = 4 0 0 5 6 0 7 8

    // The basic idea is to use TBL which can shuffle the elements in the given
    // vector flexibly. HISTCNT + SUB is used to generate the second source input
    // for TBL whose value is used to select the indexed element from src vector.

    BasicType bt = Matcher::vector_element_basic_type(this);
    Assembler::SIMD_RegVariant size = __ elemType_to_regVariant(bt);
    // dst = 0 0 0 0 0 0 0 0
    __ sve_dup($dst$$FloatRegister, size, 0);
    // dst = 5 0 0 4 3 0 2 1
    // (HISTCNT under pg counts, per lane, the matching preceding active
    // lanes - producing a running rank for each active lane.)
    __ sve_histcnt($dst$$FloatRegister, size, $pg$$PRegister,
                   $dst$$FloatRegister, $dst$$FloatRegister);
    // dst = 4 -1 -1 3 2 -1 1 0
    // (Subtract 1: active lanes hold their source index; inactive lanes
    // become -1, an out-of-range TBL index which yields zero.)
    __ sve_sub($dst$$FloatRegister, size, 1);
    // dst = 4 0 0 5 6 0 7 8
    __ sve_tbl($dst$$FloatRegister, size, $src$$FloatRegister, $dst$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}

// ------------------------------ Vector signum --------------------------------

// Vector Math.signum

// Math.signum for float/double vectors of at most 128 bits (NEON path).
// $zero and $one carry the constants the helper blends between.
instruct vsignum_le128b(vReg dst, vReg src, vReg zero, vReg one) %{
  predicate(Matcher::vector_length_in_bytes(n) <= 16);
  match(Set dst (SignumVF src (Binary zero one)));
  match(Set dst (SignumVD src (Binary zero one)));
  effect(TEMP_DEF dst);
  format %{ "vsignum_le128b $dst, $src\t# vector <= 128 bits" %}
  ins_encode %{
    __ vector_signum_neon($dst$$FloatRegister, $src$$FloatRegister, $zero$$FloatRegister,
                          $one$$FloatRegister, get_arrangement(this));
  %}
  ins_pipe(pipe_slow);
%}

// Math.signum for vectors wider than 128 bits. Vectors that wide imply
// SVE, hence the assert in the encoding.
instruct vsignum_gt128b(vReg dst, vReg src, vReg zero, vReg one, vReg tmp, pRegGov pgtmp) %{
  predicate(Matcher::vector_length_in_bytes(n) > 16);
  match(Set dst (SignumVF src (Binary zero one)));
  match(Set dst (SignumVD src (Binary zero one)));
  effect(TEMP_DEF dst, TEMP tmp, TEMP pgtmp);
  format %{ "vsignum_gt128b $dst, $src\t# vector > 128 bits. KILL $tmp, $pgtmp" %}
  ins_encode %{
    assert(UseSVE > 0, "must be sve");
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ vector_signum_sve($dst$$FloatRegister, $src$$FloatRegister, $zero$$FloatRegister,
                         $one$$FloatRegister, $tmp$$FloatRegister, $pgtmp$$PRegister,
                         __ elemType_to_regVariant(bt));
  %}
  ins_pipe(pipe_slow);
%}

dnl
dnl BITPERM($1,        $2,      $3  )
dnl BITPERM(insn_name, op_name, insn)
dnl
dnl Emits one unpredicated two-source rule mapping the given ideal node to
dnl a single SVE2 BITPERM instruction (BEXT/BDEP). Availability of the
dnl BITPERM feature is expected to be checked at match-rule-support level,
dnl not in a per-rule predicate.
define(`BITPERM', `
instruct $1(vReg dst, vReg src1, vReg src2) %{
  match(Set dst ($2 src1 src2));
  format %{ "$1 $dst, $src1, $src2\t# vector (sve)" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    Assembler::SIMD_RegVariant size = __ elemType_to_regVariant(bt);
    __ $3($dst$$FloatRegister, size,
                $src1$$FloatRegister, $src2$$FloatRegister);
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl
// ---------------------------------- CompressBitsV --------------------------------
BITPERM(vcompressBits, CompressBitsV, sve_bext)

// ----------------------------------- ExpandBitsV ---------------------------------
BITPERM(vexpandBits, ExpandBitsV, sve_bdep)

// ------------------------------------- SelectFromTwoVector ------------------------------------
// The Neon and SVE2 tbl instruction for two vector lookup requires both the source vectors to be
// consecutive. The match rules for SelectFromTwoVector reserve two consecutive vector registers
// for src1 and src2.
// Four combinations of vector registers for vselect_from_two_vectors are chosen at random
// (two from the volatile set and two from the non-volatile set), which gives the register
// allocator more freedom to choose the best pair of source registers at that point.
dnl
dnl SELECT_FROM_TWO_VECTORS($1,        $2        )
dnl SELECT_FROM_TWO_VECTORS(first_reg, second_reg)
dnl
dnl Emits one SelectFromTwoVector rule pinned to the adjacent register pair
dnl V$1/V$2: the table-lookup instruction used by the helper requires the two
dnl source vectors to live in consecutive registers, which ADL can only
dnl express by enumerating fixed pairs (see the invocations below).
define(`SELECT_FROM_TWO_VECTORS', `
instruct vselect_from_two_vectors_$1_$2(vReg dst, vReg_V$1 src1, vReg_V$2 src2,
                                        vReg index, vReg tmp) %{
  effect(TEMP_DEF dst, TEMP tmp);
  match(Set dst (SelectFromTwoVector (Binary index src1) src2));
  format %{ "vselect_from_two_vectors_$1_$2 $dst, $src1, $src2, $index\t# KILL $tmp" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    __ select_from_two_vectors($dst$$FloatRegister, $src1$$FloatRegister,
                               $src2$$FloatRegister, $index$$FloatRegister,
                               $tmp$$FloatRegister, bt, length_in_bytes);
  %}
  ins_pipe(pipe_slow);
%}')dnl
dnl
SELECT_FROM_TWO_VECTORS(10, 11)
SELECT_FROM_TWO_VECTORS(12, 13)
SELECT_FROM_TWO_VECTORS(17, 18)
SELECT_FROM_TWO_VECTORS(23, 24)
